diff --git a/clang/test/CodeGenHIP/default-attributes.hip b/clang/test/CodeGenHIP/default-attributes.hip index ee16ecd134bfee..63572bfd242b98 100644 --- a/clang/test/CodeGenHIP/default-attributes.hip +++ b/clang/test/CodeGenHIP/default-attributes.hip @@ -8,68 +8,49 @@ #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) -//. -// OPTNONE: @__hip_cuid_ = addrspace(1) global i8 0 -// OPTNONE: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata" -// OPTNONE: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 -//. -// OPT: @__hip_cuid_ = addrspace(1) global i8 0 -// OPT: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 -// OPT: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata" -//. -__device__ void extern_func(); - // OPTNONE: Function Attrs: convergent mustprogress noinline nounwind optnone // OPTNONE-LABEL: define {{[^@]+}}@_Z4funcv // OPTNONE-SAME: () #[[ATTR0:[0-9]+]] { // OPTNONE-NEXT: entry: -// OPTNONE-NEXT: call void @_Z11extern_funcv() #[[ATTR3:[0-9]+]] // OPTNONE-NEXT: ret void // -// OPT: Function Attrs: convergent mustprogress nounwind +// OPT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) // OPT-LABEL: define {{[^@]+}}@_Z4funcv // OPT-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] { // OPT-NEXT: entry: -// OPT-NEXT: tail call void @_Z11extern_funcv() #[[ATTR3:[0-9]+]] // OPT-NEXT: ret void // __device__ void func() { - extern_func(); + } // OPTNONE: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone // OPTNONE-LABEL: define {{[^@]+}}@_Z6kernelv -// OPTNONE-SAME: () #[[ATTR2:[0-9]+]] { +// OPTNONE-SAME: () #[[ATTR1:[0-9]+]] { // OPTNONE-NEXT: entry: -// OPTNONE-NEXT: call void @_Z11extern_funcv() 
#[[ATTR3]] // OPTNONE-NEXT: ret void // -// OPT: Function Attrs: convergent mustprogress norecurse nounwind +// OPT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) // OPT-LABEL: define {{[^@]+}}@_Z6kernelv -// OPT-SAME: () local_unnamed_addr #[[ATTR2:[0-9]+]] { +// OPT-SAME: () local_unnamed_addr #[[ATTR1:[0-9]+]] { // OPT-NEXT: entry: -// OPT-NEXT: tail call void @_Z11extern_funcv() #[[ATTR3]] // OPT-NEXT: ret void // __global__ void kernel() { - extern_func(); + } //. -// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// OPTNONE: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } -// OPTNONE: attributes #[[ATTR3]] = { convergent nounwind } +// OPTNONE: attributes #0 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// OPTNONE: attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } //. 
-// OPT: attributes #[[ATTR0]] = { convergent mustprogress nounwind "amdgpu-waves-per-eu"="4,10" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } -// OPT: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "amdgpu-waves-per-eu"="4,10" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } -// OPT: attributes #[[ATTR2]] = { convergent mustprogress norecurse nounwind "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } -// OPT: attributes #[[ATTR3]] = { convergent nounwind } +// OPT: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// OPT: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } //. -// OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} -// OPTNONE: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} -// OPTNONE: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// OPTNONE: !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +// OPTNONE: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// OPTNONE: !2 = !{i32 1, !"wchar_size", i32 4} //. -// OPT: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} -// OPT: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} -// OPT: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// OPT: !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +// OPT: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// OPT: !2 = !{i32 1, !"wchar_size", i32 4} //. 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 55b3b486d705df..1dd7fce2334c98 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -139,10 +139,6 @@ Changes to the AMDGPU Backend :ref:`atomicrmw ` instruction with `fadd`, `fmin` and `fmax` with addrspace(3) instead. -* AMDGPUAttributor is no longer run as part of the codegen pass - pipeline. It is expected to run as part of the middle end - optimizations. - Changes to the ARM Backend -------------------------- diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 9ddf0a310ed061..f50a18ccc21885 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -731,14 +731,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); }); - // FIXME: Why is AMDGPUAttributor not in CGSCC? - PB.registerOptimizerLastEPCallback( - [this](ModulePassManager &MPM, OptimizationLevel Level) { - if (Level != OptimizationLevel::O0) { - MPM.addPass(AMDGPUAttributorPass(*this)); - } - }); - PB.registerFullLinkTimeOptimizationLastEPCallback( [this](ModulePassManager &PM, OptimizationLevel Level) { // We want to support the -lto-partitions=N option as "best effort". 
@@ -1045,6 +1037,11 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAMDGPULowerModuleLDSLegacyPass(&TM)); } + // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run + // after their introduction + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(createAMDGPUAttributorLegacyPass()); + if (TM.getOptLevel() > CodeGenOptLevel::None) addPass(createInferAddressSpacesPass()); diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 8c951105101d96..97a8ff44866095 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -679,12 +679,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, break; } } - - // FIXME: We can spill incoming arguments and restore at the end of the - // prolog. - if (!ScratchWaveOffsetReg) - report_fatal_error( - "could not find temporary scratch offset register in prolog"); } else { ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll index 359c1e53de99e3..a38b6e3263882c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: s_add_u64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s6, s0 @@ -22,8 +22,8 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-LABEL: s_add_u64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] @@ -58,8 +58,8 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: s_sub_u64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s0, s6, s0 @@ -74,8 +74,8 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-LABEL: s_sub_u64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 0a8e805027c77a..9be8620b024eb7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -2026,7 +2026,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v1, v1 @@ -2056,7 +2056,7 @@ define float 
@buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 @@ -2083,7 +2083,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v3, v1, v1 @@ -2114,14 +2114,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX10-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2147,11 +2143,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: 
s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 @@ -2177,11 +2169,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX908-NEXT: s_mov_b64 s[8:9], 0 @@ -2208,11 +2196,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX8-NEXT: s_mov_b64 s[8:9], 0 @@ -2239,11 +2223,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, v0 ; 
GFX7-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -2278,7 +2258,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_num_f32 v3, v0, v0 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_max_num_f32 v3, v0, v0 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2305,7 +2285,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2331,7 +2311,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_f32 v3, v0, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_max_f32 v3, v0, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2359,14 +2339,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; 
GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2391,11 +2367,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2420,11 +2392,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2450,11 +2418,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; 
GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 @@ -2480,11 +2444,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 @@ -2518,7 +2478,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s6 +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen @@ -2549,7 +2509,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b32_e32 v6, s4 ; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, v1 ; GFX940-NEXT: buffer_load_dwordx2 v[0:1], v6, s[0:3], 0 offen @@ -2578,7 +2538,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: 
v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen @@ -2610,15 +2570,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen ; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2647,11 +2603,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v6, s18 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2679,11 +2631,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: 
v_mov_b32_e32 v6, s18 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2714,11 +2662,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2749,11 +2693,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2792,7 +2732,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s6 +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen @@ -2821,7 +2761,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: 
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b32_e32 v6, s4 ; GFX940-NEXT: buffer_load_dwordx2 v[2:3], v6, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] @@ -2848,7 +2788,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen @@ -2878,14 +2818,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2913,11 +2849,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v6, s18 +; 
GFX90A-NEXT: v_mov_b32_e32 v6, s8 ; GFX90A-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] @@ -2943,11 +2875,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v6, s18 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[8:9], 0 @@ -2976,11 +2904,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[8:9], 0 @@ -3009,11 +2933,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7-NEXT: s_mov_b64 
s[8:9], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index fb81176a7419e9..97d68d9c2e6213 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -2026,7 +2026,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v1, v1 @@ -2056,7 +2056,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 @@ -2083,7 +2083,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v3, v1, v1 @@ -2114,14 +2114,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: 
v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX10-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2147,11 +2143,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 @@ -2177,11 +2169,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX908-NEXT: s_mov_b64 s[8:9], 0 @@ -2208,11 +2196,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 
-; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX8-NEXT: s_mov_b64 s[8:9], 0 @@ -2239,11 +2223,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, v0 ; GFX7-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -2278,7 +2258,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_num_f32 v3, v0, v0 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_max_num_f32 v3, v0, v0 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2305,7 +2285,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2331,7 +2311,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_f32 v3, v0, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_max_f32 v3, v0, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2359,14 +2339,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2391,11 +2367,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2420,11 +2392,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: 
; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2450,11 +2418,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 @@ -2480,11 +2444,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 @@ -2518,7 +2478,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s6 +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null 
offen @@ -2549,7 +2509,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b32_e32 v6, s4 ; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, v1 ; GFX940-NEXT: buffer_load_dwordx2 v[0:1], v6, s[0:3], 0 offen @@ -2578,7 +2538,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen @@ -2610,15 +2570,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen ; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2647,11 +2603,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; 
GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v6, s18 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2679,11 +2631,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v6, s18 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2714,11 +2662,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2749,11 +2693,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, 
s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2792,7 +2732,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s6 +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen @@ -2821,7 +2761,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b32_e32 v6, s4 ; GFX940-NEXT: buffer_load_dwordx2 v[2:3], v6, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] @@ -2848,7 +2788,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen @@ -2878,14 +2818,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: 
s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2913,11 +2849,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v6, s18 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 ; GFX90A-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] @@ -2943,11 +2875,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v6, s18 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[8:9], 0 @@ -2976,11 +2904,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; 
GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[8:9], 0 @@ -3009,11 +2933,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index 705bcbddf227a6..b04bc04ab22691 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -16,8 +16,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -31,8 +31,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_dec_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: 
s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -46,8 +46,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_dec_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -59,11 +59,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -74,11 +74,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_dec_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: ds_dec_rtn_u32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -95,8 +95,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: 
s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -110,8 +110,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -125,8 +125,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -138,11 +138,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -153,11 +153,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 42 
:: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -175,7 +175,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_dec_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -197,7 +197,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_dec_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -207,7 +207,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -218,7 +218,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_dec_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: 
s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: ds_dec_u32 v0, v1 @@ -232,7 +232,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -254,7 +254,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -264,7 +264,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: 
v_dual_mov_b32 v1, s0 ; GFX11-NEXT: ds_dec_u32 v1, v0 offset:16 @@ -290,7 +290,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -305,7 +305,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_dec_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -320,7 +320,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_dec_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -332,7 +332,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -345,7 +345,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_dec_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] glc @@ 
-364,7 +364,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -381,7 +381,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -398,7 +398,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -410,7 +410,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -423,7 +423,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc @@ -443,7 +443,7 
@@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -460,7 +460,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -477,7 +477,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -489,7 +489,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -502,7 +502,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] 
offset:16 glc @@ -522,7 +522,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -534,7 +534,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_dec_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -546,7 +546,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_dec_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -557,7 +557,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -569,7 +569,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_dec_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] @@ -584,7 +584,7 @@ define amdgpu_kernel void 
@global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -598,7 +598,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -612,7 +612,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -623,7 +623,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -635,7 +635,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 @@ -651,7 +651,7 @@ define amdgpu_kernel void 
@global_atomic_dec_noret_i32_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -665,7 +665,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -679,7 +679,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -690,7 +690,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -702,7 +702,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 @@ -718,7 +718,7 @@ define 
amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -740,7 +740,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -762,7 +762,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -774,7 +774,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -787,10 +787,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, v1, s[2:3] offset:20 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -812,7 +810,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -829,7 +827,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -846,7 +844,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -857,7 +855,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -869,10 +867,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; GFX11-LABEL: 
global_atomic_dec_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[0:1] offset:20 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -889,7 +885,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -904,7 +900,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -919,7 +915,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -934,7 +930,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ 
-950,7 +946,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -969,7 +965,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -986,7 +982,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -1003,7 +999,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1018,7 +1014,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -1036,7 +1032,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) 
#1 ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1056,7 +1052,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -1073,7 +1069,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -1090,7 +1086,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1105,7 +1101,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -1123,7 +1119,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX11-LABEL: 
flat_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1143,7 +1139,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1155,7 +1151,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1167,7 +1163,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1179,7 +1175,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1193,7 +1189,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: 
v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -1210,7 +1206,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1224,7 +1220,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1238,7 +1234,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1250,7 +1246,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -1266,7 +1262,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 
v1, s1 @@ -1284,7 +1280,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1298,7 +1294,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1312,7 +1308,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1324,7 +1320,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -1340,7 +1336,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -1358,7 +1354,7 @@ define 
amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1380,7 +1376,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1402,7 +1398,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1422,7 +1418,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1445,14 +1441,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; 
GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_lshlrev_b32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_dec_u32 v3, v[0:1], v3 offset:20 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1476,7 +1470,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1493,7 +1487,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1510,7 +1504,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1525,7 +1519,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1544,14 +1538,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:20 @@ -1570,7 +1562,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1591,7 +1583,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1612,7 +1604,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1628,7 +1620,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,7 +1637,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -1665,7 +1657,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1688,7 +1680,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1711,7 +1703,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1727,7 +1719,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1746,7 +1738,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -1767,7 +1759,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1780,7 +1772,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1793,7 +1785,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1806,7 +1798,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; 
GFX10-LABEL: flat_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1821,7 +1813,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1839,7 +1831,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1854,7 +1846,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1869,7 +1861,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1882,7 +1874,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1899,7 +1891,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1918,7 +1910,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1933,7 +1925,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1948,7 +1940,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1961,7 +1953,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1978,7 +1970,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1997,7 +1989,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -2023,7 +2015,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -2049,7 +2041,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2070,7 +2062,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 
@@ -2094,15 +2086,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:40 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2126,7 +2117,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2144,7 +2135,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2162,7 +2153,7 @@ define amdgpu_kernel void 
@flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2178,7 +2169,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -2198,15 +2189,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:40 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2224,7 +2214,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr addrspace(1) 
%add_use) #1 { ; CI-LABEL: atomic_dec_shl_base_lds_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; CI-NEXT: v_mov_b32_e32 v2, 9 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2242,7 +2232,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_dec_shl_base_lds_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 9 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2260,7 +2250,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_dec_shl_base_lds_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2276,7 +2266,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 9 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8 @@ -2289,10 +2279,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: atomic_dec_shl_base_lds_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8 @@ -2317,8 +2305,8 @@ define amdgpu_kernel void 
@atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2333,8 +2321,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_dec_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2349,8 +2337,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_dec_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2363,12 +2351,12 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2379,12 +2367,12 @@ define amdgpu_kernel void 
@lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_dec_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -2401,8 +2389,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2417,8 +2405,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2433,8 +2421,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; 
GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2447,12 +2435,12 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2463,12 +2451,12 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_dec_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -2486,7 +2474,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2498,7 +2486,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_dec_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 
0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2510,7 +2498,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_dec_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2521,7 +2509,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2533,7 +2521,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_dec_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -2548,7 +2536,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2560,7 +2548,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2572,7 +2560,7 @@ 
define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2583,7 +2571,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2595,7 +2583,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_dec_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -2611,7 +2599,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2627,7 +2615,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_dec_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2643,7 +2631,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: 
global_atomic_dec_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2656,7 +2644,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2670,7 +2658,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_dec_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2690,7 +2678,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2708,7 +2696,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2726,7 +2714,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2739,7 +2727,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2753,7 +2741,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2774,7 +2762,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2792,7 +2780,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2810,7 +2798,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2823,7 +2811,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2837,7 +2825,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2858,7 +2846,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2871,7 +2859,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_dec_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2884,7 +2872,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_dec_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; 
GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2896,7 +2884,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2909,7 +2897,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_dec_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2925,7 +2913,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2940,7 +2928,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2955,7 +2943,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2967,7 +2955,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2980,7 +2968,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2997,7 +2985,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3012,7 +3000,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3027,7 +3015,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 
v2, 0 @@ -3039,7 +3027,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -3052,7 +3040,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3069,7 +3057,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -3092,7 +3080,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -3115,7 +3103,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -3128,7 +3116,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -3142,17 +3130,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 42 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 glc +; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v3, v[1:2], s[2:3] offset:40 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -3168,7 +3154,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -3186,7 +3172,7 @@ define amdgpu_kernel void 
@global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3204,7 +3190,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -3216,7 +3202,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -3229,13 +3215,11 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:40 +; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[0:1] offset:40 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3250,7 +3234,7 @@ define amdgpu_kernel void 
@global_atomic_dec_noret_i64_offset_addr64(ptr addrspa define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_dec_shl_base_lds_0_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -3269,7 +3253,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_dec_shl_base_lds_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 @@ -3289,7 +3273,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX9-LABEL: atomic_dec_shl_base_lds_0_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 9 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3306,7 +3290,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 @@ -3319,21 +3303,18 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; GFX11-LABEL: atomic_dec_shl_base_lds_0_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 9 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, 9 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v3, v[0:1] offset:16 +; GFX11-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b32 v3, v2, s[2:3] -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b32 v3, v0, s[2:3] +; GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index b3a7e65f771c43..f6a997fb0fb01b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -16,8 +16,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -31,8 +31,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: 
v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -46,8 +46,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -59,11 +59,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -74,11 +74,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: ds_inc_rtn_u32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -95,8 +95,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i32_offset: 
; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -110,8 +110,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -125,8 +125,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -138,11 +138,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -153,11 +153,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -175,7 +175,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_inc_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -197,7 +197,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_inc_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -207,7 +207,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -218,7 +218,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_inc_noret_i32: ; 
GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: ds_inc_u32 v0, v1 @@ -232,7 +232,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -254,7 +254,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -264,7 +264,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: ds_inc_u32 v1, v0 offset:16 @@ -290,7 +290,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -305,7 +305,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -320,7 +320,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -332,7 +332,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -345,7 +345,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
global_atomic_inc_u32 v0, v1, v0, s[2:3] glc @@ -364,7 +364,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -381,7 +381,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -398,7 +398,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -410,7 +410,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -423,7 +423,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 
v0, v1, v0, s[2:3] offset:16 glc @@ -443,7 +443,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -460,7 +460,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -477,7 +477,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -489,7 +489,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -502,7 +502,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc @@ -522,7 +522,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -534,7 +534,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_inc_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -546,7 +546,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_inc_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -557,7 +557,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -569,7 +569,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_inc_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] @@ -584,7 +584,7 @@ 
define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -598,7 +598,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -612,7 +612,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -623,7 +623,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -635,7 +635,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 @@ -651,7 +651,7 @@ define amdgpu_kernel void 
@global_atomic_inc_noret_i32_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -665,7 +665,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -679,7 +679,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -690,7 +690,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -702,7 +702,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 @@ -718,7 +718,7 @@ define 
amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -740,7 +740,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -762,7 +762,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -774,7 +774,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -787,10 +787,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, v1, s[2:3] offset:20 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -812,7 +810,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -829,7 +827,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -846,7 +844,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -857,7 +855,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -869,10 +867,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; GFX11-LABEL: 
global_atomic_inc_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:20 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -889,7 +885,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; CI-NEXT: v_mov_b32_e32 v2, 9 ; CI-NEXT: s_mov_b32 m0, -1 @@ -907,7 +903,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_inc_shl_base_lds_0_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 9 ; VI-NEXT: s_mov_b32 m0, -1 @@ -925,7 +921,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -941,7 +937,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 9 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 @@ -954,10 +950,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; ; GFX11-LABEL: atomic_inc_shl_base_lds_0_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 @@ -982,8 +976,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -998,8 +992,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_inc_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1014,8 +1008,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_inc_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1028,12 +1022,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_inc_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1044,12 +1038,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_inc_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -1066,8 +1060,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1082,8 +1076,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; 
VI-LABEL: lds_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1098,8 +1092,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1112,12 +1106,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1128,12 +1122,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_inc_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: 
ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -1151,7 +1145,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1163,7 +1157,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_inc_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1175,7 +1169,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_inc_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1186,7 +1180,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1198,7 +1192,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_inc_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -1213,7 +1207,7 @@ define amdgpu_kernel void 
@lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1225,7 +1219,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1237,7 +1231,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1248,7 +1242,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1260,7 +1254,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_inc_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -1276,7 +1270,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr 
addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1292,7 +1286,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_inc_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1308,7 +1302,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_inc_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1321,7 +1315,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_inc_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1335,7 +1329,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_inc_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1355,7 +1349,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset: ; CI: ; 
%bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1373,7 +1367,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1391,7 +1385,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1404,7 +1398,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1418,7 +1412,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1439,7 +1433,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1457,7 +1451,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1475,7 +1469,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1488,7 +1482,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1502,7 +1496,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1523,7 +1517,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1536,7 +1530,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_inc_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1549,7 +1543,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_inc_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1561,7 +1555,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1574,7 +1568,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_inc_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1590,7 +1584,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: 
v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1605,7 +1599,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1620,7 +1614,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1632,7 +1626,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1645,7 +1639,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1662,7 +1656,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt 
lgkmcnt(0) @@ -1677,7 +1671,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1692,7 +1686,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1704,7 +1698,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1717,7 +1711,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1734,7 +1728,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: v_mov_b32_e32 v0, s2 @@ -1757,7 +1751,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1780,7 +1774,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -1793,7 +1787,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -1807,17 +1801,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 42 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 glc +; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v3, v[1:2], s[2:3] offset:40 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv 
; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1833,7 +1825,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1851,7 +1843,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1869,7 +1861,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1881,7 +1873,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1894,13 +1886,11 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; GFX11-LABEL: 
global_atomic_inc_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:40 +; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[0:1] offset:40 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -1915,7 +1905,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -1930,7 +1920,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1945,7 +1935,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1960,7 +1950,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -1976,7 +1966,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1995,7 +1985,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2012,7 +2002,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2029,7 +2019,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2044,7 +2034,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -2062,7 +2052,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2082,7 +2072,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2099,7 +2089,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2116,7 +2106,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2131,7 +2121,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: 
v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -2149,7 +2139,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2169,7 +2159,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2181,7 +2171,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2193,7 +2183,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2205,7 +2195,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -2219,7 +2209,7 @@ define amdgpu_kernel void 
@flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2236,7 +2226,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2250,7 +2240,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2264,7 +2254,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2276,7 +2266,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -2292,7 +2282,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2310,7 +2300,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2324,7 +2314,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2338,7 +2328,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2350,7 +2340,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -2366,7 +2356,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 
0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2384,7 +2374,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2406,7 +2396,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2428,7 +2418,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2448,7 +2438,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2471,14 +2461,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 
0x0 -; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_lshlrev_b32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2502,7 +2490,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2519,7 +2507,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2536,7 +2524,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2551,7 +2539,7 @@ define 
amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -2570,14 +2558,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20 @@ -2596,7 +2582,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -2615,7 +2601,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_inc_shl_base_lds_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: 
v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 @@ -2635,7 +2621,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 9 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2652,7 +2638,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 @@ -2665,21 +2651,18 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; GFX11-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 9 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, 9 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v3, v[0:1] offset:16 +; GFX11-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b32 v3, v2, s[2:3] -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b32 v3, v0, s[2:3] +; GFX11-NEXT: 
global_store_b64 v3, v[1:2], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2695,7 +2678,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2716,7 +2699,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2737,7 +2720,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2753,7 +2736,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2770,7 +2753,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ 
-2790,7 +2773,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2813,7 +2796,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2836,7 +2819,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2852,7 +2835,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2871,7 +2854,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -2892,7 +2875,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr 
%out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2915,7 +2898,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2938,7 +2921,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2954,7 +2937,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2973,7 +2956,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -2994,7 +2977,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % define 
amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3007,7 +2990,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3020,7 +3003,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3033,7 +3016,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3048,7 +3031,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3066,7 +3049,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3081,7 +3064,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3096,7 +3079,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3109,7 +3092,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3126,7 +3109,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3145,7 +3128,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3160,7 
+3143,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3175,7 +3158,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3188,7 +3171,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3205,7 +3188,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3224,7 +3207,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -3250,7 +3233,7 @@ define amdgpu_kernel void 
@flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -3276,7 +3259,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -3297,7 +3280,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -3321,15 +3304,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3353,7 +3335,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -3371,7 +3353,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3389,7 +3371,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -3405,7 +3387,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -3425,15 +3407,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3451,12 +3432,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: nocse_lds_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s6, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v1, s6 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0 @@ -3471,12 +3452,12 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; VI-LABEL: nocse_lds_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0 @@ -3491,11 +3472,11 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1 @@ -3507,11 +3488,11 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; GFX10-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v2, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3526,10 +3507,10 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; GFX11-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_inc_rtn_u32 v2, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll index c45bccd184c12f..bb5ccc3657dc4d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll @@ -66,7 +66,7 @@ define amdgpu_ps i32 @select_sgpr_trunc_and_cond(i32 inreg %a.0, i32 inreg %a.1, define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) { ; WAVE64-LABEL: sgpr_trunc_brcond: ; WAVE64: ; %bb.0: ; %entry -; WAVE64-NEXT: s_load_dword s0, s[2:3], 0x24 +; WAVE64-NEXT: s_load_dword s0, s[0:1], 0x24 ; WAVE64-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-NEXT: s_xor_b32 s0, s0, 1 ; WAVE64-NEXT: s_and_b32 s0, s0, 1 @@ -83,7 +83,7 @@ define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) { ; ; WAVE32-LABEL: sgpr_trunc_brcond: ; WAVE32: ; %bb.0: ; %entry -; WAVE32-NEXT: s_load_dword s0, s[2:3], 0x24 +; WAVE32-NEXT: s_load_dword s0, s[0:1], 0x24 ; WAVE32-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-NEXT: s_xor_b32 s0, s0, 1 ; WAVE32-NEXT: s_and_b32 s0, s0, 1 @@ -113,7 +113,7 @@ bb1: define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) { ; WAVE64-LABEL: brcond_sgpr_trunc_and: ; WAVE64: ; %bb.0: ; %entry -; WAVE64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; WAVE64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; WAVE64-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-NEXT: s_and_b32 s0, s0, s1 ; WAVE64-NEXT: s_xor_b32 s0, s0, 1 @@ -131,7 +131,7 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) { ; ; WAVE32-LABEL: brcond_sgpr_trunc_and: ; WAVE32: ; %bb.0: ; %entry -; WAVE32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; WAVE32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; WAVE32-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-NEXT: s_and_b32 s0, s0, s1 ; WAVE32-NEXT: s_xor_b32 s0, s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index e4c609c9331086..24652982c6584f 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -12,9 +12,9 @@ declare hidden void @external_void_func_byval(ptr addrspace(5) byval([16 x i32]) define amdgpu_kernel void @kernel_caller_stack() { ; MUBUF-LABEL: kernel_caller_stack: ; MUBUF: ; %bb.0: -; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_mov_b32 s32, 0 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 @@ -34,8 +34,8 @@ define amdgpu_kernel void @kernel_caller_stack() { ; FLATSCR-LABEL: kernel_caller_stack: ; FLATSCR: ; %bb.0: ; FLATSCR-NEXT: s_mov_b32 s32, 0 -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_add_u32 s0, s32, 4 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 @@ -60,9 +60,9 @@ define amdgpu_kernel void @kernel_caller_stack() { define amdgpu_kernel void @kernel_caller_byval() { ; MUBUF-LABEL: kernel_caller_byval: ; MUBUF: ; %bb.0: -; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -155,9 +155,9 @@ define amdgpu_kernel void @kernel_caller_byval() { ; ; FLATSCR-LABEL: kernel_caller_byval: ; FLATSCR: ; %bb.0: -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index 405b1e8f3a250f..eb20178f9f4d88 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -452,7 +452,7 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -468,7 +468,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -493,7 +493,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -513,7 +513,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -539,7 +539,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -562,7 +562,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -589,7 +589,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -612,7 +612,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -644,7 +644,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: 
load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -679,7 +679,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -725,14 +725,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] @@ -769,17 +769,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: v_mov_b32_e32 v7, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 
v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -821,7 +821,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -858,7 +858,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -918,7 +918,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -949,7 +949,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -986,7 +986,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void 
@i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1005,7 +1005,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1033,7 +1033,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1051,7 +1051,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1080,7 +1080,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1096,7 
+1096,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1204,7 +1204,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1221,7 +1221,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1247,7 
+1247,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1265,7 +1265,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1292,7 +1292,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1310,7 +1310,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1337,7 +1337,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1354,7 +1354,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1381,7 +1381,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1401,7 +1401,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index 5515de0cd2fee1..78d908455e019b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -193,7 +193,7 @@ bb12: define amdgpu_kernel void @break_loop(i32 %arg) { ; CHECK-LABEL: break_loop: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: ; implicit-def: $vgpr1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll index 48986ea9ef9825..96db1f889690df 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 s0, s0, s9 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -25,8 +25,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-NEXT: s_add_u32 s0, s0, s15 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s9 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -42,7 +42,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s32, 16 ; GFX11-NEXT: s_mov_b32 s33, 0 @@ -143,8 +143,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() { define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 s0, s0, s9 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -160,8 +160,8 @@ define amdgpu_kernel void 
@kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-NEXT: s_add_u32 s0, s0, s15 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s9 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -177,7 +177,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s32, 16 ; GFX11-NEXT: s_mov_b32 s33, 0 @@ -278,8 +278,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() { define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 s0, s0, s9 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -296,8 +296,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-NEXT: s_add_u32 s0, s0, s15 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s9 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_movk_i32 s32, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -314,7 +314,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s32, 32 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: 
s_mov_b32 s33, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 34efb089b72bf1..1e1c90d142a1f3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3037,21 +3037,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 10 +; GPRIDX-NEXT: user_sgpr_count = 6 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 1 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 0 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_info = 0 -; GPRIDX-NEXT: enable_vgpr_workitem_id = 2 +; GPRIDX-NEXT: enable_vgpr_workitem_id = 0 ; GPRIDX-NEXT: enable_exception_msb = 0 ; GPRIDX-NEXT: granulated_lds_size = 0 ; GPRIDX-NEXT: enable_exception = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_buffer = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_id = 0 ; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -3067,7 +3067,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 -; GPRIDX-NEXT: kernarg_segment_byte_size = 28 +; GPRIDX-NEXT: kernarg_segment_byte_size = 12 ; GPRIDX-NEXT: 
workgroup_fbarrier_count = 0 ; GPRIDX-NEXT: wavefront_sgpr_count = 13 ; GPRIDX-NEXT: workitem_vgpr_count = 3 @@ -3085,8 +3085,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GPRIDX-NEXT: s_load_dword s8, s[6:7], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8 ; GPRIDX-NEXT: s_mov_b32 s4, 0 ; GPRIDX-NEXT: s_mov_b32 s5, 0x40080000 ; GPRIDX-NEXT: s_mov_b32 s2, 0 @@ -3128,21 +3128,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 10 +; MOVREL-NEXT: user_sgpr_count = 6 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 1 +; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 0 +; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_info = 0 -; MOVREL-NEXT: enable_vgpr_workitem_id = 2 +; MOVREL-NEXT: enable_vgpr_workitem_id = 0 ; MOVREL-NEXT: enable_exception_msb = 0 ; MOVREL-NEXT: granulated_lds_size = 0 ; MOVREL-NEXT: enable_exception = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_buffer = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 1 +; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 0 ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 +; MOVREL-NEXT: enable_sgpr_dispatch_id = 0 ; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -3158,7 +3158,7 @@ define amdgpu_kernel void 
@dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 -; MOVREL-NEXT: kernarg_segment_byte_size = 28 +; MOVREL-NEXT: kernarg_segment_byte_size = 12 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 ; MOVREL-NEXT: wavefront_sgpr_count = 9 ; MOVREL-NEXT: workitem_vgpr_count = 4 @@ -3176,8 +3176,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; MOVREL-NEXT: s_load_dword s8, s[6:7], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8 ; MOVREL-NEXT: s_mov_b32 s4, 0 ; MOVREL-NEXT: s_mov_b32 s5, 0x40080000 ; MOVREL-NEXT: s_mov_b32 s2, 0 @@ -3209,7 +3209,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -3220,21 +3220,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 10 +; GFX10-NEXT: user_sgpr_count = 6 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_z = 1 +; GFX10-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_z = 0 ; GFX10-NEXT: enable_sgpr_workgroup_info = 0 -; 
GFX10-NEXT: enable_vgpr_workitem_id = 2 +; GFX10-NEXT: enable_vgpr_workitem_id = 0 ; GFX10-NEXT: enable_exception_msb = 0 ; GFX10-NEXT: granulated_lds_size = 0 ; GFX10-NEXT: enable_exception = 0 ; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 -; GFX10-NEXT: enable_sgpr_dispatch_ptr = 1 +; GFX10-NEXT: enable_sgpr_dispatch_ptr = 0 ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX10-NEXT: enable_sgpr_dispatch_id = 1 +; GFX10-NEXT: enable_sgpr_dispatch_id = 0 ; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -3250,9 +3250,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: workitem_private_segment_byte_size = 0 ; GFX10-NEXT: workgroup_group_segment_byte_size = 0 ; GFX10-NEXT: gds_segment_byte_size = 0 -; GFX10-NEXT: kernarg_segment_byte_size = 28 +; GFX10-NEXT: kernarg_segment_byte_size = 12 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 9 +; GFX10-NEXT: wavefront_sgpr_count = 7 ; GFX10-NEXT: workitem_vgpr_count = 3 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -3269,21 +3269,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s8, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x40080000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s8, 1 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 2 +; GFX10-NEXT: s_cmp_eq_u32 s6, 2 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] 
-; GFX10-NEXT: s_cmp_eq_u32 s8, 3 +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s5, 0x40140000 ; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] -; GFX10-NEXT: s_cmp_eq_u32 s8, 4 +; GFX10-NEXT: s_cmp_eq_u32 s6, 4 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -3312,21 +3312,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: enable_mem_ordered = 1 ; GFX11-NEXT: enable_fwd_progress = 0 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX11-NEXT: user_sgpr_count = 13 +; GFX11-NEXT: user_sgpr_count = 15 ; GFX11-NEXT: enable_trap_handler = 0 ; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_y = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_z = 1 +; GFX11-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_z = 0 ; GFX11-NEXT: enable_sgpr_workgroup_info = 0 -; GFX11-NEXT: enable_vgpr_workitem_id = 2 +; GFX11-NEXT: enable_vgpr_workitem_id = 0 ; GFX11-NEXT: enable_exception_msb = 0 ; GFX11-NEXT: granulated_lds_size = 0 ; GFX11-NEXT: enable_exception = 0 ; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 -; GFX11-NEXT: enable_sgpr_dispatch_ptr = 1 +; GFX11-NEXT: enable_sgpr_dispatch_ptr = 0 ; GFX11-NEXT: enable_sgpr_queue_ptr = 0 ; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX11-NEXT: enable_sgpr_dispatch_id = 1 +; GFX11-NEXT: enable_sgpr_dispatch_id = 0 ; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX11-NEXT: enable_sgpr_private_segment_size = 0 ; GFX11-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -3342,7 +3342,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: workitem_private_segment_byte_size = 0 ; GFX11-NEXT: workgroup_group_segment_byte_size = 0 ; GFX11-NEXT: gds_segment_byte_size = 0 -; GFX11-NEXT: kernarg_segment_byte_size = 28 +; GFX11-NEXT: 
kernarg_segment_byte_size = 12 ; GFX11-NEXT: workgroup_fbarrier_count = 0 ; GFX11-NEXT: wavefront_sgpr_count = 7 ; GFX11-NEXT: workitem_vgpr_count = 3 @@ -3361,8 +3361,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: .end_amd_kernel_code_t ; GFX11-NEXT: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x40080000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -4054,21 +4054,21 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 10 +; GPRIDX-NEXT: user_sgpr_count = 6 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 1 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 0 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_info = 0 -; GPRIDX-NEXT: enable_vgpr_workitem_id = 2 +; GPRIDX-NEXT: enable_vgpr_workitem_id = 0 ; GPRIDX-NEXT: enable_exception_msb = 0 ; GPRIDX-NEXT: granulated_lds_size = 0 ; GPRIDX-NEXT: enable_exception = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_buffer = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_id = 0 ; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4084,9 +4084,9 @@ define amdgpu_kernel void 
@dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 -; GPRIDX-NEXT: kernarg_segment_byte_size = 28 +; GPRIDX-NEXT: kernarg_segment_byte_size = 12 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 12 +; GPRIDX-NEXT: wavefront_sgpr_count = 10 ; GPRIDX-NEXT: workitem_vgpr_count = 2 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4102,8 +4102,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dword s2, s[6:7], 0x8 -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GPRIDX-NEXT: s_load_dword s2, s[4:5], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, 0 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) ; GPRIDX-NEXT: s_cmp_eq_u32 s2, 1 @@ -4138,21 +4138,21 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 10 +; MOVREL-NEXT: user_sgpr_count = 6 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 1 +; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 0 +; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_info = 0 -; MOVREL-NEXT: enable_vgpr_workitem_id = 2 +; MOVREL-NEXT: enable_vgpr_workitem_id = 0 ; MOVREL-NEXT: enable_exception_msb = 0 ; MOVREL-NEXT: granulated_lds_size = 0 ; MOVREL-NEXT: enable_exception = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_buffer = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 
1 +; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 0 ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 +; MOVREL-NEXT: enable_sgpr_dispatch_id = 0 ; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4168,9 +4168,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 -; MOVREL-NEXT: kernarg_segment_byte_size = 28 +; MOVREL-NEXT: kernarg_segment_byte_size = 12 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 8 +; MOVREL-NEXT: wavefront_sgpr_count = 6 ; MOVREL-NEXT: workitem_vgpr_count = 3 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4186,8 +4186,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dword s2, s[6:7], 0x8 -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; MOVREL-NEXT: s_load_dword s2, s[4:5], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s2, 1 ; MOVREL-NEXT: s_cselect_b32 s3, 2.0, 1.0 @@ -4223,21 +4223,21 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 10 +; GFX10-NEXT: user_sgpr_count = 6 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_z = 1 +; GFX10-NEXT: 
enable_sgpr_workgroup_id_y = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_z = 0 ; GFX10-NEXT: enable_sgpr_workgroup_info = 0 -; GFX10-NEXT: enable_vgpr_workitem_id = 2 +; GFX10-NEXT: enable_vgpr_workitem_id = 0 ; GFX10-NEXT: enable_exception_msb = 0 ; GFX10-NEXT: granulated_lds_size = 0 ; GFX10-NEXT: enable_exception = 0 ; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 -; GFX10-NEXT: enable_sgpr_dispatch_ptr = 1 +; GFX10-NEXT: enable_sgpr_dispatch_ptr = 0 ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX10-NEXT: enable_sgpr_dispatch_id = 1 +; GFX10-NEXT: enable_sgpr_dispatch_id = 0 ; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4253,9 +4253,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: workitem_private_segment_byte_size = 0 ; GFX10-NEXT: workgroup_group_segment_byte_size = 0 ; GFX10-NEXT: gds_segment_byte_size = 0 -; GFX10-NEXT: kernarg_segment_byte_size = 28 +; GFX10-NEXT: kernarg_segment_byte_size = 12 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 8 +; GFX10-NEXT: wavefront_sgpr_count = 6 ; GFX10-NEXT: workitem_vgpr_count = 2 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -4272,8 +4272,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 @@ -4308,21 +4308,21 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: enable_mem_ordered = 1 ; 
GFX11-NEXT: enable_fwd_progress = 0 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX11-NEXT: user_sgpr_count = 13 +; GFX11-NEXT: user_sgpr_count = 15 ; GFX11-NEXT: enable_trap_handler = 0 ; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_y = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_z = 1 +; GFX11-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_z = 0 ; GFX11-NEXT: enable_sgpr_workgroup_info = 0 -; GFX11-NEXT: enable_vgpr_workitem_id = 2 +; GFX11-NEXT: enable_vgpr_workitem_id = 0 ; GFX11-NEXT: enable_exception_msb = 0 ; GFX11-NEXT: granulated_lds_size = 0 ; GFX11-NEXT: enable_exception = 0 ; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 -; GFX11-NEXT: enable_sgpr_dispatch_ptr = 1 +; GFX11-NEXT: enable_sgpr_dispatch_ptr = 0 ; GFX11-NEXT: enable_sgpr_queue_ptr = 0 ; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX11-NEXT: enable_sgpr_dispatch_id = 1 +; GFX11-NEXT: enable_sgpr_dispatch_id = 0 ; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX11-NEXT: enable_sgpr_private_segment_size = 0 ; GFX11-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4338,9 +4338,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: workitem_private_segment_byte_size = 0 ; GFX11-NEXT: workgroup_group_segment_byte_size = 0 ; GFX11-NEXT: gds_segment_byte_size = 0 -; GFX11-NEXT: kernarg_segment_byte_size = 28 +; GFX11-NEXT: kernarg_segment_byte_size = 12 ; GFX11-NEXT: workgroup_fbarrier_count = 0 -; GFX11-NEXT: wavefront_sgpr_count = 5 +; GFX11-NEXT: wavefront_sgpr_count = 4 ; GFX11-NEXT: workitem_vgpr_count = 2 ; GFX11-NEXT: reserved_vgpr_first = 0 ; GFX11-NEXT: reserved_vgpr_count = 0 @@ -4357,16 +4357,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: .end_amd_kernel_code_t ; GFX11-NEXT: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 
s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s4, 1 -; GFX11-NEXT: s_cselect_b32 s2, 2.0, 1.0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 2 -; GFX11-NEXT: s_cselect_b32 s2, 0x40400000, s2 -; GFX11-NEXT: s_cmp_eq_u32 s4, 3 -; GFX11-NEXT: s_cselect_b32 s2, 4.0, s2 +; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: s_cselect_b32 s3, 2.0, 1.0 +; GFX11-NEXT: s_cmp_eq_u32 s2, 2 +; GFX11-NEXT: s_cselect_b32 s3, 0x40400000, s3 +; GFX11-NEXT: s_cmp_eq_u32 s2, 3 +; GFX11-NEXT: s_cselect_b32 s2, 4.0, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -4401,21 +4401,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 10 +; GPRIDX-NEXT: user_sgpr_count = 6 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 1 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 0 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_info = 0 -; GPRIDX-NEXT: enable_vgpr_workitem_id = 2 +; GPRIDX-NEXT: enable_vgpr_workitem_id = 0 ; GPRIDX-NEXT: enable_exception_msb = 0 ; GPRIDX-NEXT: granulated_lds_size = 0 ; GPRIDX-NEXT: enable_exception = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_buffer = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_id = 0 ; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 ; GPRIDX-NEXT: 
enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4431,9 +4431,9 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 -; GPRIDX-NEXT: kernarg_segment_byte_size = 28 +; GPRIDX-NEXT: kernarg_segment_byte_size = 12 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 13 +; GPRIDX-NEXT: wavefront_sgpr_count = 11 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4449,17 +4449,17 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dword s8, s[6:7], 0x8 -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GPRIDX-NEXT: s_load_dword s6, s[4:5], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GPRIDX-NEXT: s_mov_b32 s2, 0 ; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000 ; GPRIDX-NEXT: v_mov_b32_e32 v2, 0 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) -; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1 +; GPRIDX-NEXT: s_cmp_eq_u32 s6, 1 ; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; GPRIDX-NEXT: s_cmp_eq_u32 s8, 2 +; GPRIDX-NEXT: s_cmp_eq_u32 s6, 2 ; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GPRIDX-NEXT: s_cmp_eq_u32 s8, 3 +; GPRIDX-NEXT: s_cmp_eq_u32 s6, 3 ; GPRIDX-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; GPRIDX-NEXT: v_mov_b32_e32 v0, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s3 @@ -4477,7 +4477,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 +; MOVREL-NEXT: 
granulated_wavefront_sgpr_count = 0 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -4488,21 +4488,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 10 +; MOVREL-NEXT: user_sgpr_count = 6 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 1 +; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 0 +; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_info = 0 -; MOVREL-NEXT: enable_vgpr_workitem_id = 2 +; MOVREL-NEXT: enable_vgpr_workitem_id = 0 ; MOVREL-NEXT: enable_exception_msb = 0 ; MOVREL-NEXT: granulated_lds_size = 0 ; MOVREL-NEXT: enable_exception = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_buffer = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 1 +; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 0 ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 +; MOVREL-NEXT: enable_sgpr_dispatch_id = 0 ; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4518,9 +4518,9 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 -; MOVREL-NEXT: kernarg_segment_byte_size = 28 +; MOVREL-NEXT: kernarg_segment_byte_size = 12 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 9 +; MOVREL-NEXT: wavefront_sgpr_count = 7 ; MOVREL-NEXT: workitem_vgpr_count = 4 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; 
MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4536,16 +4536,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dword s8, s[6:7], 0x8 -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; MOVREL-NEXT: s_load_dword s6, s[4:5], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; MOVREL-NEXT: s_mov_b32 s2, 0 ; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 +; MOVREL-NEXT: s_cmp_eq_u32 s6, 1 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; MOVREL-NEXT: s_cmp_eq_u32 s8, 2 +; MOVREL-NEXT: s_cmp_eq_u32 s6, 2 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; MOVREL-NEXT: s_cmp_eq_u32 s8, 3 +; MOVREL-NEXT: s_cmp_eq_u32 s6, 3 ; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 @@ -4565,7 +4565,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -4576,21 +4576,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 10 +; GFX10-NEXT: user_sgpr_count = 6 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_z = 1 +; GFX10-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_z 
= 0 ; GFX10-NEXT: enable_sgpr_workgroup_info = 0 -; GFX10-NEXT: enable_vgpr_workitem_id = 2 +; GFX10-NEXT: enable_vgpr_workitem_id = 0 ; GFX10-NEXT: enable_exception_msb = 0 ; GFX10-NEXT: granulated_lds_size = 0 ; GFX10-NEXT: enable_exception = 0 ; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 -; GFX10-NEXT: enable_sgpr_dispatch_ptr = 1 +; GFX10-NEXT: enable_sgpr_dispatch_ptr = 0 ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX10-NEXT: enable_sgpr_dispatch_id = 1 +; GFX10-NEXT: enable_sgpr_dispatch_id = 0 ; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4606,9 +4606,9 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: workitem_private_segment_byte_size = 0 ; GFX10-NEXT: workgroup_group_segment_byte_size = 0 ; GFX10-NEXT: gds_segment_byte_size = 0 -; GFX10-NEXT: kernarg_segment_byte_size = 28 +; GFX10-NEXT: kernarg_segment_byte_size = 12 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 9 +; GFX10-NEXT: wavefront_sgpr_count = 7 ; GFX10-NEXT: workitem_vgpr_count = 3 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -4625,17 +4625,17 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s8, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x40080000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s8, 1 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 2 +; GFX10-NEXT: s_cmp_eq_u32 s6, 
2 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX10-NEXT: s_cmp_eq_u32 s8, 3 +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 ; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -4664,21 +4664,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: enable_mem_ordered = 1 ; GFX11-NEXT: enable_fwd_progress = 0 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX11-NEXT: user_sgpr_count = 13 +; GFX11-NEXT: user_sgpr_count = 15 ; GFX11-NEXT: enable_trap_handler = 0 ; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_y = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_z = 1 +; GFX11-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_z = 0 ; GFX11-NEXT: enable_sgpr_workgroup_info = 0 -; GFX11-NEXT: enable_vgpr_workitem_id = 2 +; GFX11-NEXT: enable_vgpr_workitem_id = 0 ; GFX11-NEXT: enable_exception_msb = 0 ; GFX11-NEXT: granulated_lds_size = 0 ; GFX11-NEXT: enable_exception = 0 ; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 -; GFX11-NEXT: enable_sgpr_dispatch_ptr = 1 +; GFX11-NEXT: enable_sgpr_dispatch_ptr = 0 ; GFX11-NEXT: enable_sgpr_queue_ptr = 0 ; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX11-NEXT: enable_sgpr_dispatch_id = 1 +; GFX11-NEXT: enable_sgpr_dispatch_id = 0 ; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX11-NEXT: enable_sgpr_private_segment_size = 0 ; GFX11-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4694,7 +4694,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: workitem_private_segment_byte_size = 0 ; GFX11-NEXT: workgroup_group_segment_byte_size = 0 ; GFX11-NEXT: gds_segment_byte_size = 0 -; GFX11-NEXT: kernarg_segment_byte_size = 28 +; GFX11-NEXT: kernarg_segment_byte_size = 12 ; GFX11-NEXT: workgroup_fbarrier_count = 0 ; GFX11-NEXT: wavefront_sgpr_count = 7 ; GFX11-NEXT: workitem_vgpr_count = 3 @@ 
-4713,8 +4713,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: .end_amd_kernel_code_t ; GFX11-NEXT: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x40080000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll index 9b9249b62b0bca..7cd99fcfd5e740 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll @@ -1,8 +1,6 @@ -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 < %s | llc -mcpu=gfx900 | FileCheck -check-prefixes=GCN,RW-FLAT %s -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | llc | FileCheck -check-prefixes=GCN,RO-FLAT %s -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx940 < %s | llc | FileCheck -check-prefixes=GCN,RO-FLAT %s - -target triple = "amdgcn-amd-amdhsa" +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,RW-FLAT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GCN,RO-FLAT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,RO-FLAT %s ; Make sure flat_scratch_init is set diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index a5e4151bf36958..63e7339d829e1d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -8,9 +8,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: 
s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 @@ -26,11 +26,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 @@ -46,7 +46,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 @@ -62,7 +62,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 @@ -78,7 +78,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: 
s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b32 s1, s0, 2 @@ -105,10 +105,10 @@ bb: define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-LABEL: store_load_vindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_add_u32_e32 v1, 0, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -121,10 +121,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX10-LABEL: store_load_vindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -139,7 +139,6 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX940-LABEL: store_load_vindex_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 @@ -153,12 +152,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0, v1 @@ -168,10 +165,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX12-LABEL: store_load_vindex_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 ; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -325,9 +321,9 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -345,11 +341,11 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -367,7 +363,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -385,7 +381,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v2, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 @@ -402,7 +398,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 @@ -434,8 +430,8 @@ bb: define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-LABEL: store_load_vindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -453,10 +449,10 @@ define amdgpu_kernel void 
@store_load_vindex_small_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -475,7 +471,6 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 @@ -489,14 +484,11 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:256 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 @@ -506,13 +498,12 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX12-LABEL: store_load_vindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 -; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX12-NEXT: v_mov_b32_e32 v2, 15 +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:380 scope:SCOPE_SYS @@ -639,9 +630,9 @@ bb: define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -659,11 +650,11 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
v_mov_b32_e32 v0, 15 @@ -681,7 +672,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_large_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -700,7 +691,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 @@ -718,7 +709,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 @@ -750,8 +741,8 @@ bb: define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-LABEL: store_load_vindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -769,10 +760,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), 
s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -791,7 +782,6 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 @@ -806,15 +796,12 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_movk_i32 s0, 0x4004 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 15 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 @@ -824,13 +811,12 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX12-LABEL: store_load_vindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX12-NEXT: v_mov_b32_e32 v2, 15 +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16508 scope:SCOPE_SYS @@ -959,8 +945,8 @@ bb: define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-LABEL: store_load_large_imm_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 @@ -976,10 +962,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX10-LABEL: store_load_large_imm_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 s0, 0x3e80 @@ -1128,9 +1114,9 @@ bb: define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-LABEL: store_load_vidx_sidx_offset: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 
flat_scratch_hi, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2 @@ -1143,11 +1129,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX10-LABEL: store_load_vidx_sidx_offset: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2 @@ -1160,8 +1146,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX940-LABEL: store_load_vidx_sidx_offset: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2 @@ -1174,11 +1159,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ 
-1188,10 +1173,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX12-LABEL: store_load_vidx_sidx_offset: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index 2d3b6ee3e9823a..632dbd45279fbe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -7,16 +7,14 @@ declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x h ; bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. 
declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) -declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) -declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -27,7 +25,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -43,7 +41,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -84,10 +82,10 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { define amdgpu_kernel void 
@flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -109,10 +107,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -134,12 +132,12 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] +; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[2:3] ; GFX940-NEXT: s_endpgm %ret = call <2 x i16> 
@llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) ret void @@ -156,56 +154,6 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> ret <2 x i16> %ret } -define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX940-LABEL: local_atomic_fadd_v2f16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: ds_pk_add_f16 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret void -} - -define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX940-LABEL: local_atomic_fadd_v2f16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret <2 x half> %ret -} - -define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: ds_pk_add_f16 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) { ; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset: ; GFX940: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 453b229bf62bd9..66b22bedaf0721 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -20,27 +20,26 @@ declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) -declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1) define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; 
%main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -72,12 +71,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -87,12 +86,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: 
s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -108,22 +107,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -155,12 +154,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, 
double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -170,12 +169,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -191,22 +190,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -238,12 +237,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 
s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -253,12 +252,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -274,22 +273,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -321,12 +320,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -336,12 +335,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; 
GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -357,22 +356,22 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -404,12 +403,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double 
%data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -419,12 +418,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -440,22 +439,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -487,12 +486,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 
s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -502,12 +501,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -523,22 +522,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 
-; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -570,12 +569,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -585,12 +584,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; 
GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -606,22 +605,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -653,12 +652,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 
%vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -668,12 +667,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -689,22 +688,22 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -736,12 +735,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], 
s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -751,12 +750,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -772,22 +771,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 
-; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -819,12 +818,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -834,12 +833,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; 
GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -855,22 +854,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -902,12 +901,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; 
GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -917,12 +916,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -938,22 +937,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; 
GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -985,12 +984,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 
s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -1000,12 +999,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -1021,7 +1020,7 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1030,7 +1029,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1044,7 +1043,7 @@ main_body: define amdgpu_kernel void 
@global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1053,7 +1052,7 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1067,7 +1066,7 @@ main_body: define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1076,7 +1075,7 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1090,16 +1089,16 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: s_mov_b32 s0, s5 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s0, 
v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB39_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 @@ -1126,22 +1125,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB39_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB39_2: @@ -1154,21 +1153,21 @@ main_body: define 
amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB40_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB40_2: @@ -1176,22 +1175,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB40_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB40_2: @@ -1204,16 +1203,16 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: s_mov_b32 s0, s5 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s0, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB41_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 @@ -1240,22 +1239,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 
v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB41_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB41_2: @@ -1268,21 +1267,21 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB42_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB42_2: @@ -1290,22 +1289,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB42_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB42_2: @@ -1480,16 +1479,16 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: s_mov_b32 s0, s5 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s0, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB49_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 @@ -1514,22 +1513,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB49_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB49_2: @@ -1542,7 +1541,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: 
v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] @@ -1566,7 +1565,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1583,7 +1582,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1595,7 +1594,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1612,7 +1611,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] @@ -1637,7 +1636,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1761,7 +1760,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -1770,7 +1769,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1803,7 +1802,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] @@ -1825,7 +1824,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1842,7 +1841,7 @@ main_body: define 
amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -1851,7 +1850,7 @@ define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1884,7 +1883,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -1893,7 +1892,7 @@ define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1923,119 +1922,47 @@ main_body: ret double %ret } -define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) { -; GFX90A-LABEL: local_atomic_fadd_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 -; 
GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB63_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NEXT: ds_add_f64 v2, v[0:1] -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB63_2: -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: local_atomic_fadd_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB63_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; GFX940-NEXT: s_load_dword s6, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NEXT: ds_add_f64 v2, v[0:1] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB63_2: -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) - ret void -} - -define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) { -; GFX90A-LABEL: local_atomic_fadd_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] -; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: local_atomic_fadd_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) - ret double %ret -} - define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB65_2 +; GFX90A-NEXT: s_cbranch_execz .LBB63_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB65_2: +; GFX90A-NEXT: .LBB63_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; 
GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB65_2 +; GFX940-NEXT: s_cbranch_execz .LBB63_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB65_2: +; GFX940-NEXT: .LBB63_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -2045,91 +1972,91 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB66_2 +; GFX90A-NEXT: s_cbranch_execz .LBB64_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB66_2: +; GFX90A-NEXT: .LBB64_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB66_2 +; GFX940-NEXT: s_cbranch_execz .LBB64_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB66_2: +; GFX940-NEXT: .LBB64_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret void } -define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 { +define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #2 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: 
v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB67_2 +; GFX90A-NEXT: s_cbranch_execz .LBB65_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB67_2: +; GFX90A-NEXT: .LBB65_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB67_2 +; GFX940-NEXT: s_cbranch_execz .LBB65_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB67_2: +; GFX940-NEXT: .LBB65_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 
4.0 seq_cst @@ -2158,54 +2085,6 @@ main_body: ret double %ret } -define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, double %data) #2 { -; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) - ret double %ret -} - -define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double %data) #3 { -; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_safe: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_safe: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) - ret double %ret -} - attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" 
"amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" } -attributes #2 = { "denormal-fp-math"="ieee,ieee" "amdgpu-unsafe-fp-atomics"="true" } -attributes #3 = { "denormal-fp-math"="ieee,ieee" } -attributes #4 = { "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #2 = { "denormal-fp-math"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index e051cc28469fae..05cdb54f5dd747 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2 @@ -37,8 +37,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8 @@ -67,8 +67,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: fast_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2 @@ -87,8 +87,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8 @@ -113,8 +113,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { ; CI-LABEL: unsafe_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2 @@ -133,8 +133,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8 @@ -159,8 +159,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 @@ -188,8 +188,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 @@ -225,8 +225,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: fast_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 @@ -243,8 +243,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 @@ -269,8 +269,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { ; CI-LABEL: unsafe_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 @@ -287,8 +287,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 @@ -313,8 +313,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -341,8 +341,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -376,8 +376,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: fast_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -401,8 +401,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -433,8 +433,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; CI-LABEL: unsafe_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -458,8 +458,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -491,8 +491,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 @@ -545,8 +545,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 @@ -588,8 +588,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 @@ -682,8 +682,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 @@ -747,8 +747,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 @@ -792,8 +792,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 @@ -845,8 +845,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 @@ -922,8 +922,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 @@ -1007,8 +1007,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 @@ -1050,8 +1050,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll index fe2e7afb7048ed..388ef2497e4356 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll @@ -952,9 +952,9 @@ define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %ar ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile load (s8) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p1) :: (volatile load (s32) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CHECK-NEXT: %12:_(p5) = nuw nusw G_PTR_ADD [[COPY]], [[C]](s32) + ; CHECK-NEXT: %5:_(p5) = nuw nusw G_PTR_ADD [[COPY]], [[C]](s32) ; CHECK-NEXT: G_STORE [[LOAD]](s8), [[COPY]](p5) :: (store (s8) into %ir.arg0, addrspace 5) - ; CHECK-NEXT: G_STORE [[LOAD1]](s32), %12(p5) :: (store (s32) into %ir.gep1, addrspace 5) + ; CHECK-NEXT: G_STORE [[LOAD1]](s32), %5(p5) :: (store (s32) into %ir.gep1, addrspace 5) ; CHECK-NEXT: SI_RETURN %val0 = load volatile i8, ptr addrspace(1) undef %val1 = load volatile i32, ptr addrspace(1) undef diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 9443b39dcdc033..8859ac69923a99 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 ; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_mov_b32 s4, s0 @@ -35,8 +35,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xc8 ; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_mov_b32 s4, s0 @@ -59,7 +59,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 @@ -83,7 +83,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V5-NEXT: s_mov_b64 
s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 @@ -114,9 +114,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 @@ -127,9 +127,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_shared: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xcc +; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xcc ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 @@ -140,7 +140,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 @@ -152,7 +152,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 @@ -170,9 +170,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { define amdgpu_kernel void 
@llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 @@ -183,9 +183,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_private: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xc8 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 @@ -196,7 +196,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 @@ -208,7 +208,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 @@ -226,12 +226,12 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { define amdgpu_kernel void @llvm_trap() { ; GFX8V4-LABEL: llvm_trap: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX8V4-NEXT: s_trap 2 ; ; GFX8V5-LABEL: llvm_trap: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xc8 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_trap 2 ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll index 696cbdb75f1ed9..136c51d775b43c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll @@ -19,9 +19,6 @@ define amdgpu_kernel void @return_type_is_too_big_vector() { ; CHECK-LABEL: name: return_type_is_too_big_vector ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1 (%ir-block.0): ; CHECK-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index db944b98a30135..a1c99f5cf60297 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -7,9 +7,9 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr addrspace(1) %ptr, i32 %val, i32 %idx) #0 { ; GCN-LABEL: v_insert_v64i32_varidx: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[20:23], s[6:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[24:25], s[6:7], 0x10 -; GCN-NEXT: s_add_u32 s0, s0, s13 +; GCN-NEXT: s_load_dwordx4 s[20:23], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x10 +; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll index 5185f6c4ada5ba..3abc21f812e145 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addrspace(1) %ptr.out) #0 { ; GCN-LABEL: v_insert_v64i32_37: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] @@ -53,7 +53,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addr ; ; GFX10-LABEL: v_insert_v64i32_37: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0xf @@ -101,9 +101,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addr ; ; GFX11-LABEL: v_insert_v64i32_37: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0xf diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll index e67ada74c23e65..e9292f4e34dcda 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=amdgpu-attributor < %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji 
-amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator -o - | FileCheck -check-prefix=HSA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator -global-isel %s -o - | FileCheck -check-prefix=HSA %s ; HSA-LABEL: name: default_kernel ; HSA: liveins: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll index 652d22ac1224fc..f2fe815a71202c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll @@ -5,9 +5,9 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind { ; HSA-VI-LABEL: name: i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -20,9 +20,9 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; LEGACY-MESA-VI-LABEL: name: i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 
4) @@ -40,9 +40,9 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind { ; HSA-VI-LABEL: name: i8_zext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -55,9 +55,9 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; LEGACY-MESA-VI-LABEL: name: i8_zext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -75,9 +75,9 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind { ; HSA-VI-LABEL: name: i8_sext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -90,9 +90,9 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; LEGACY-MESA-VI-LABEL: name: i8_sext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -110,9 +110,9 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind { ; HSA-VI-LABEL: name: i16_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -125,9 +125,9 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; LEGACY-MESA-VI-LABEL: name: i16_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = 
COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -145,9 +145,9 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind { ; HSA-VI-LABEL: name: i16_zext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -160,9 +160,9 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; LEGACY-MESA-VI-LABEL: name: i16_zext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -180,9 +180,9 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) 
nocapture %out, i16 signext %in) nounwind { ; HSA-VI-LABEL: name: i16_sext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -195,9 +195,9 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; LEGACY-MESA-VI-LABEL: name: i16_sext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -215,9 +215,9 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind { ; HSA-VI-LABEL: name: i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 
(p1), align 16, addrspace 4) @@ -229,9 +229,9 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou ; ; LEGACY-MESA-VI-LABEL: name: i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -248,9 +248,9 @@ entry: define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind { ; HSA-VI-LABEL: name: f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -262,9 +262,9 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n ; ; LEGACY-MESA-VI-LABEL: name: f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: 
[[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -281,9 +281,9 @@ entry: define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; HSA-VI-LABEL: name: v2i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -295,9 +295,9 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v2i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -314,9 +314,9 @@ entry: define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; HSA-VI-LABEL: name: v2i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = 
G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -328,9 +328,9 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v2i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -347,9 +347,9 @@ entry: define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v2i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -361,9 +361,9 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v2i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = 
G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -380,9 +380,9 @@ entry: define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind { ; HSA-VI-LABEL: name: v2f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -394,9 +394,9 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; LEGACY-MESA-VI-LABEL: name: v2f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -413,9 +413,9 @@ entry: define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { ; HSA-VI-LABEL: name: v3i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -427,9 +427,9 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i ; ; LEGACY-MESA-VI-LABEL: name: v3i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -446,9 +446,9 @@ entry: define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind { ; HSA-VI-LABEL: name: v3i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -460,9 +460,9 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; LEGACY-MESA-VI-LABEL: name: v3i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; 
LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -479,9 +479,9 @@ entry: define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v3i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -493,9 +493,9 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v3i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -512,9 +512,9 @@ entry: define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind { ; HSA-VI-LABEL: name: v3f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: 
[[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -526,9 +526,9 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; LEGACY-MESA-VI-LABEL: name: v3f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -545,9 +545,9 @@ entry: define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; HSA-VI-LABEL: name: v4i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -559,9 +559,9 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v4i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY 
$sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -578,9 +578,9 @@ entry: define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; HSA-VI-LABEL: name: v4i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -592,9 +592,9 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v4i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -611,9 +611,9 @@ entry: define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v4i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: 
[[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -625,9 +625,9 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v4i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -644,9 +644,9 @@ entry: define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind { ; HSA-VI-LABEL: name: v4f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -658,9 +658,9 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; LEGACY-MESA-VI-LABEL: name: v4f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; 
LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -677,9 +677,9 @@ entry: define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; HSA-VI-LABEL: name: v8i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -691,9 +691,9 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v8i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -710,9 +710,9 @@ entry: define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; HSA-VI-LABEL: name: v8i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: 
liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -724,9 +724,9 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v8i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -743,9 +743,9 @@ entry: define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v8i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -757,9 +757,9 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v8i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: 
$sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -776,9 +776,9 @@ entry: define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind { ; HSA-VI-LABEL: name: v8f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -790,9 +790,9 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; LEGACY-MESA-VI-LABEL: name: v8f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -809,9 +809,9 @@ entry: define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; HSA-VI-LABEL: name: 
v16i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -823,9 +823,9 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v16i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -842,9 +842,9 @@ entry: define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; HSA-VI-LABEL: name: v16i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -856,9 +856,9 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v16i16_arg ; 
LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -875,9 +875,9 @@ entry: define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v16i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -889,9 +889,9 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; ; LEGACY-MESA-VI-LABEL: name: v16i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -908,9 +908,9 @@ entry: define amdgpu_kernel void 
@v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind { ; HSA-VI-LABEL: name: v16f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -922,9 +922,9 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; LEGACY-MESA-VI-LABEL: name: v16f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -941,9 +941,9 @@ entry: define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind { ; HSA-VI-LABEL: name: kernel_arg_i64 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -955,9 +955,9 @@ define 
amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; LEGACY-MESA-VI-LABEL: name: kernel_arg_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -973,9 +973,9 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; HSA-VI-LABEL: name: f64_kernel_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -987,9 +987,9 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; LEGACY-MESA-VI-LABEL: name: f64_kernel_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; 
LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1006,9 +1006,9 @@ entry: define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1020,9 +1020,9 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: i1_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1038,9 +1038,9 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_zext_i32 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1053,9 +1053,9 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i32 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1073,9 +1073,9 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_zext_i64 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1088,9 +1088,9 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; 
LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1108,9 +1108,9 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_sext_i32 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1123,9 +1123,9 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i32 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1143,9 +1143,9 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel 
void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_sext_i64 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1158,9 +1158,9 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1180,9 +1180,9 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind { ; HSA-VI-LABEL: name: empty_struct_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: 
(dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1192,9 +1192,9 @@ define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: empty_struct_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1208,9 +1208,9 @@ define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind { define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind { ; HSA-VI-LABEL: name: empty_array_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1220,9 +1220,9 @@ define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: empty_array_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1244,9 +1244,9 @@ define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind { define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad, {i32, i64} %arg1) { ; HSA-VI-LABEL: name: struct_argument_alignment ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1272,9 +1272,9 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad, ; ; LEGACY-MESA-VI-LABEL: name: struct_argument_alignment ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1312,9 +1312,9 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad, define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr addrspace(1)} %arg0, i8 %pad, {ptr addrspace(3), ptr addrspace(1234)} %arg1) { ; HSA-VI-LABEL: name: 
pointer_in_struct_argument ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1340,9 +1340,9 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add ; ; LEGACY-MESA-VI-LABEL: name: pointer_in_struct_argument ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1382,9 +1382,9 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { ; HSA-VI-LABEL: name: packed_struct_argument_alignment ; HSA-VI: bb.1 (%ir-block.1): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable 
invariant load (s32), align 16, addrspace 4) @@ -1406,9 +1406,9 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; ; LEGACY-MESA-VI-LABEL: name: packed_struct_argument_alignment ; LEGACY-MESA-VI: bb.1 (%ir-block.1): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1441,16 +1441,16 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, define amdgpu_kernel void @unused_i32_arg(ptr addrspace(1) nocapture %out, i32 %unused, i32 %in) nounwind { ; HSA-VI-LABEL: name: unused_i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: S_ENDPGM 0 ; ; LEGACY-MESA-VI-LABEL: name: unused_i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: S_ENDPGM 0 entry: ret void @@ -1460,9 +1460,9 @@ entry: define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) { ; HSA-VI-LABEL: name: byref_constant_i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; 
HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1475,9 +1475,9 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1496,9 +1496,9 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) align 2 %in.byref) { ; HSA-VI-LABEL: name: byref_constant_i16_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1511,9 +1511,9 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr 
addrspace(1) nocapture %ou ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i16_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1532,9 +1532,9 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align 4 %in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_constant_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1550,9 +1550,9 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; 
LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1574,9 +1574,9 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) align(16) %in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_constant_v4i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1592,9 +1592,9 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture % ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_v4i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1616,9 +1616,9 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture % define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) 
%in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_align_constant_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1634,9 +1634,9 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; LEGACY-MESA-VI-LABEL: name: byref_align_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1658,9 +1658,9 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg ; HSA-VI: bb.1 (%ir-block.1): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1676,9 +1676,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; ; LEGACY-MESA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.1): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1701,9 +1701,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_global_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1716,9 +1716,9 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ; ; LEGACY-MESA-VI-LABEL: name: byref_global_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: 
$sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1736,9 +1736,9 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_flat_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1751,9 +1751,9 @@ define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p ; ; LEGACY-MESA-VI-LABEL: name: byref_flat_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1771,9 +1771,9 @@ define amdgpu_kernel 
void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_constant_32bit_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1786,9 +1786,9 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_32bit_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1806,9 +1806,9 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(999) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_unknown_as_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: 
[[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1821,9 +1821,9 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture % ; ; LEGACY-MESA-VI-LABEL: name: byref_unknown_as_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1842,9 +1842,9 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture % define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(3) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_local_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1857,9 +1857,9 @@ define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, ; ; LEGACY-MESA-VI-LABEL: name: byref_local_i32_arg ; LEGACY-MESA-VI: bb.1 
(%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1877,9 +1877,9 @@ define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(4) %in0.byref, ptr addrspace(4) byref(i32) align(4) %in1.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: multi_byref_constant_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1899,9 +1899,9 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu ; ; LEGACY-MESA-VI-LABEL: name: multi_byref_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = 
G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1929,9 +1929,9 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_constant_i32_arg_offset0 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF @@ -1941,9 +1941,9 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg_offset0 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF @@ -1958,9 +1958,9 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind { ; HSA-VI-LABEL: name: p3i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; 
HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p3), align 16, addrspace 4) @@ -1970,9 +1970,9 @@ define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: p3i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p3), addrspace 4) @@ -1986,9 +1986,9 @@ define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind { define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind { ; HSA-VI-LABEL: name: p1i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 9 ; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 0 ; HSA-VI-NEXT: G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, addrspace 3) @@ -1996,9 +1996,9 @@ define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: p1i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: 
[[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 9 ; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 0 ; LEGACY-MESA-VI-NEXT: G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, addrspace 3) @@ -2010,9 +2010,9 @@ define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind { define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind { ; HSA-VI-LABEL: name: v2p1i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p1>), addrspace 4) @@ -2022,9 +2022,9 @@ define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: v2p1i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p1>), align 4, addrspace 4) @@ -2038,9 +2038,9 @@ define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind { define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind { ; HSA-VI-LABEL: name: v2p3i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: 
$sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p3>), align 16, addrspace 4) @@ -2050,9 +2050,9 @@ define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: v2p3i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p3>), align 4, addrspace 4) @@ -2066,9 +2066,9 @@ define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind { define amdgpu_kernel void @v2p1i8_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x ptr addrspace(3)> } %arg) nounwind { ; HSA-VI-LABEL: name: v2p1i8_in_struct_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x s64>), addrspace 4) @@ -2084,9 +2084,9 @@ define 
amdgpu_kernel void @v2p1i8_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x p ; ; LEGACY-MESA-VI-LABEL: name: v2p1i8_in_struct_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll index 6b0e9618754df8..eebbe20abd043e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll @@ -4,9 +4,6 @@ define amdgpu_kernel void @system_one_as_acquire() { ; CHECK-LABEL: name: system_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") acquire @@ -16,9 +13,6 @@ define amdgpu_kernel void @system_one_as_acquire() { define amdgpu_kernel void @system_one_as_release() { ; CHECK-LABEL: name: system_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") release @@ -28,9 +22,6 @@ define amdgpu_kernel void @system_one_as_release() { define amdgpu_kernel void @system_one_as_acq_rel() { ; CHECK-LABEL: name: system_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: 
$sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") acq_rel @@ -40,9 +31,6 @@ define amdgpu_kernel void @system_one_as_acq_rel() { define amdgpu_kernel void @system_one_as_seq_cst() { ; CHECK-LABEL: name: system_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") seq_cst @@ -52,9 +40,6 @@ define amdgpu_kernel void @system_one_as_seq_cst() { define amdgpu_kernel void @singlethread_one_as_acquire() { ; CHECK-LABEL: name: singlethread_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") acquire @@ -64,9 +49,6 @@ define amdgpu_kernel void @singlethread_one_as_acquire() { define amdgpu_kernel void @singlethread_one_as_release() { ; CHECK-LABEL: name: singlethread_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") release @@ -76,9 +58,6 @@ define amdgpu_kernel void @singlethread_one_as_release() { define amdgpu_kernel void @singlethread_one_as_acq_rel() { ; CHECK-LABEL: name: singlethread_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") acq_rel @@ -88,9 +67,6 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel() { define amdgpu_kernel void 
@singlethread_one_as_seq_cst() { ; CHECK-LABEL: name: singlethread_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") seq_cst @@ -100,9 +76,6 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst() { define amdgpu_kernel void @agent_one_as_acquire() { ; CHECK-LABEL: name: agent_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") acquire @@ -112,9 +85,6 @@ define amdgpu_kernel void @agent_one_as_acquire() { define amdgpu_kernel void @agent_one_as_release() { ; CHECK-LABEL: name: agent_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") release @@ -124,9 +94,6 @@ define amdgpu_kernel void @agent_one_as_release() { define amdgpu_kernel void @agent_one_as_acq_rel() { ; CHECK-LABEL: name: agent_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") acq_rel @@ -136,9 +103,6 @@ define amdgpu_kernel void @agent_one_as_acq_rel() { define amdgpu_kernel void @agent_one_as_seq_cst() { ; CHECK-LABEL: name: agent_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") seq_cst @@ -148,9 +112,6 @@ define 
amdgpu_kernel void @agent_one_as_seq_cst() { define amdgpu_kernel void @workgroup_one_as_acquire() { ; CHECK-LABEL: name: workgroup_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") acquire @@ -160,9 +121,6 @@ define amdgpu_kernel void @workgroup_one_as_acquire() { define amdgpu_kernel void @workgroup_one_as_release() { ; CHECK-LABEL: name: workgroup_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") release @@ -172,9 +130,6 @@ define amdgpu_kernel void @workgroup_one_as_release() { define amdgpu_kernel void @workgroup_one_as_acq_rel() { ; CHECK-LABEL: name: workgroup_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") acq_rel @@ -184,9 +139,6 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() { define amdgpu_kernel void @workgroup_one_as_seq_cst() { ; CHECK-LABEL: name: workgroup_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") seq_cst @@ -196,9 +148,6 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() { define amdgpu_kernel void @wavefront_one_as_acquire() { ; CHECK-LABEL: name: wavefront_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; 
CHECK-NEXT: G_FENCE 4, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") acquire @@ -208,9 +157,6 @@ define amdgpu_kernel void @wavefront_one_as_acquire() { define amdgpu_kernel void @wavefront_one_as_release() { ; CHECK-LABEL: name: wavefront_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") release @@ -220,9 +166,6 @@ define amdgpu_kernel void @wavefront_one_as_release() { define amdgpu_kernel void @wavefront_one_as_acq_rel() { ; CHECK-LABEL: name: wavefront_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") acq_rel @@ -232,9 +175,6 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel() { define amdgpu_kernel void @wavefront_one_as_seq_cst() { ; CHECK-LABEL: name: wavefront_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") seq_cst @@ -244,9 +184,6 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst() { define amdgpu_kernel void @system_acquire() { ; CHECK-LABEL: name: system_acquire ; CHECK: bb.1.entry: - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: S_ENDPGM 0 entry: ret void @@ -255,9 +192,6 @@ entry: define amdgpu_kernel void @system_release() { ; CHECK-LABEL: name: system_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 1 ; 
CHECK-NEXT: S_ENDPGM 0 fence release @@ -267,9 +201,6 @@ define amdgpu_kernel void @system_release() { define amdgpu_kernel void @system_acq_rel() { ; CHECK-LABEL: name: system_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 1 ; CHECK-NEXT: S_ENDPGM 0 fence acq_rel @@ -279,9 +210,6 @@ define amdgpu_kernel void @system_acq_rel() { define amdgpu_kernel void @system_seq_cst() { ; CHECK-LABEL: name: system_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 1 ; CHECK-NEXT: S_ENDPGM 0 fence seq_cst @@ -291,9 +219,6 @@ define amdgpu_kernel void @system_seq_cst() { define amdgpu_kernel void @singlethread_acquire() { ; CHECK-LABEL: name: singlethread_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") acquire @@ -303,9 +228,6 @@ define amdgpu_kernel void @singlethread_acquire() { define amdgpu_kernel void @singlethread_release() { ; CHECK-LABEL: name: singlethread_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") release @@ -315,9 +237,6 @@ define amdgpu_kernel void @singlethread_release() { define amdgpu_kernel void @singlethread_acq_rel() { ; CHECK-LABEL: name: singlethread_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") acq_rel @@ -327,9 +246,6 @@ 
define amdgpu_kernel void @singlethread_acq_rel() { define amdgpu_kernel void @singlethread_seq_cst() { ; CHECK-LABEL: name: singlethread_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") seq_cst @@ -339,9 +255,6 @@ define amdgpu_kernel void @singlethread_seq_cst() { define amdgpu_kernel void @agent_acquire() { ; CHECK-LABEL: name: agent_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") acquire @@ -351,9 +264,6 @@ define amdgpu_kernel void @agent_acquire() { define amdgpu_kernel void @agent_release() { ; CHECK-LABEL: name: agent_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") release @@ -363,9 +273,6 @@ define amdgpu_kernel void @agent_release() { define amdgpu_kernel void @agent_acq_rel() { ; CHECK-LABEL: name: agent_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") acq_rel @@ -375,9 +282,6 @@ define amdgpu_kernel void @agent_acq_rel() { define amdgpu_kernel void @agent_seq_cst() { ; CHECK-LABEL: name: agent_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") seq_cst @@ -387,9 +291,6 @@ define amdgpu_kernel void @agent_seq_cst() { define amdgpu_kernel 
void @workgroup_acquire() { ; CHECK-LABEL: name: workgroup_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") acquire @@ -399,9 +300,6 @@ define amdgpu_kernel void @workgroup_acquire() { define amdgpu_kernel void @workgroup_release() { ; CHECK-LABEL: name: workgroup_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") release @@ -411,9 +309,6 @@ define amdgpu_kernel void @workgroup_release() { define amdgpu_kernel void @workgroup_acq_rel() { ; CHECK-LABEL: name: workgroup_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") acq_rel @@ -423,9 +318,6 @@ define amdgpu_kernel void @workgroup_acq_rel() { define amdgpu_kernel void @workgroup_seq_cst() { ; CHECK-LABEL: name: workgroup_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") seq_cst @@ -435,9 +327,6 @@ define amdgpu_kernel void @workgroup_seq_cst() { define amdgpu_kernel void @wavefront_acquire() { ; CHECK-LABEL: name: wavefront_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") acquire @@ -447,9 +336,6 @@ define amdgpu_kernel void @wavefront_acquire() { define amdgpu_kernel void 
@wavefront_release() { ; CHECK-LABEL: name: wavefront_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") release @@ -459,9 +345,6 @@ define amdgpu_kernel void @wavefront_release() { define amdgpu_kernel void @wavefront_acq_rel() { ; CHECK-LABEL: name: wavefront_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") acq_rel @@ -471,9 +354,6 @@ define amdgpu_kernel void @wavefront_acq_rel() { define amdgpu_kernel void @wavefront_seq_cst() { ; CHECK-LABEL: name: wavefront_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll index 8813462652efdb..ecad793ad58987 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll @@ -810,14 +810,14 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; 
GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -932,14 +932,14 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX35]], [[C3]](s32) ; GCN-NEXT: G_STORE [[C1]](s64), [[PTR_ADD2]](p5) :: (store (s64) into %ir.alloca1 + 8, addrspace 5) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @void_fastcc_multi_byval - ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) 
= COPY [[COPY6]] + ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY48:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[FRAME_INDEX36:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 @@ -978,14 +978,14 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = 
COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -1096,14 +1096,14 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX34]], [[C3]](s32) ; GCN-NEXT: G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store (s32) into %ir.alloca + 8, addrspace 5) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @void_fastcc_byval_and_stack_passed - ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY48:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[FRAME_INDEX35:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 @@ -1170,26 +1170,26 @@ define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 { ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; 
GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @i64_fastcc_i64 - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64) ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1211,23 +1211,50 @@ entry: ret i64 %ret } -declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0) #1 +declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0) define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspace(1) %a) #1 { ; GCN-LABEL: name: sibling_call_p1i8_fastcc_p1i8 ; GCN: bb.1.entry: - ; GCN-NEXT: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @p1i8_fastcc_p1i8 + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = 
COPY [[COPY3]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p1) ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GCN-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>) - ; GCN-NEXT: SI_TCRETURN [[GV]](p0), @p1i8_fastcc_p1i8, 0, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY12]](p4) + ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[COPY13]](p4) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY14]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY17]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[COPY18]](s32) + ; GCN-NEXT: $vgpr31 = COPY [[COPY19]](s32) + ; GCN-NEXT: SI_TCRETURN [[GV]](p0), @p1i8_fastcc_p1i8, 0, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 entry: %ret = tail call fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %a) ret ptr addrspace(1) %ret @@ -1241,25 +1268,25 @@ define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 { ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY 
$sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @i16_fastcc_i16 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: 
[[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) ; GCN-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) @@ -1288,25 +1315,25 @@ define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 { ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @f16_fastcc_f16 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: 
[[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) ; GCN-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) @@ -1335,28 +1362,28 @@ define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY9]](<2 x s16>), [[COPY10]](<2 x 
s16>) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @v3i16_fastcc_v3i16 - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s16>) ; GCN-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF @@ -1389,26 +1416,26 @@ define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: 
[[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @v4i16_fastcc_v4i16 - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), 
[[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) ; GCN-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) @@ -1438,14 +1465,14 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -1454,14 +1481,14 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 ; GCN-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY11]](s32), [[COPY12]](s32) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @v2i64_fastcc_v2i64 - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; 
GCN-NEXT: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>) ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1486,7 +1513,7 @@ entry: } attributes #0 = { nounwind } -attributes #1 = { nounwind noinline "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { nounwind noinline } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll index c3938e673a6da6..2f718814ef77b5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 
{ ; CHECK-LABEL: use_lds_globals: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 4 ; CHECK-NEXT: s_mov_b32 m0, -1 ; CHECK-NEXT: ds_read_b32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll index b8b7256011df89..7587aa0cad2d4f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel < %s 2>&1 | FileCheck %s @@ -11,25 +11,25 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p) { ; GCN-LABEL: name: load_zeroinit_lds_global ; GCN: bb.1 (%ir-block.0): - ; GCN: liveins: $sgpr2_sgpr3 - ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 - ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 40 + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 40 ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @lds - ; GFX8: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_MOV_B32_1]], [[S_MOV_B32_]], 
implicit-def dead $scc - ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0 - ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 - ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]] + ; GFX6: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_MOV_B32_1]], [[S_MOV_B32_]], implicit-def dead $scc + ; GFX6: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0 + ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]] ; GCN: $m0 = S_MOV_B32 -1 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX8: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 0, 0, implicit $m0, implicit $exec - ; GFX9: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 40, 0, implicit $m0, implicit $exec - ; GFX8: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 - ; GFX8: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 - ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_LOAD_DWORDX2_IMM]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX8: BUFFER_STORE_DWORD_OFFSET [[DS_READ_B32_]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec - ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] - ; GFX9: FLAT_STORE_DWORD [[COPY2]], [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 0, 0, implicit $m0, implicit $exec + ; GFX8: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 40, 0, implicit $m0, implicit $exec + ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GFX6: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = 
REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_LOAD_DWORDX2_IMM]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: BUFFER_STORE_DWORD_OFFSET [[DS_READ_B32_]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec + ; GFX8: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GFX8: FLAT_STORE_DWORD [[COPY2]], [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr ; GCN: S_ENDPGM 0 %gep = getelementptr [256 x i32], ptr addrspace(3) @lds, i32 0, i32 10 %ld = load i32, ptr addrspace(3) %gep diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index 90f34acaa17aae..1a49a38158122e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f32_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -48,7 +48,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f32_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
global_load_dword v1, v0, s[2:3] glc dlc @@ -62,9 +62,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_div_scale_f32_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -93,7 +91,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -112,7 +110,7 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f32_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -133,7 +131,7 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f32_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -147,9 +145,7 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_div_scale_f32_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -178,7 +174,7 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f64_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -188,7 +184,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -197,11 +193,12 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f64_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 @@ -210,9 +207,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: 
s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[2:3], v[2:3], v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -220,35 +215,31 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f64_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[2:3], v[2:3], v[0:1] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1] offset:8 glc dlc +; 
GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[2:3], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -269,7 +260,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f64_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -279,7 +270,7 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[2:3], v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -288,11 +279,12 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f64_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, 
vcc, 8, v0 @@ -301,9 +293,7 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[2:3], v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -311,35 +301,31 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f64_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[2:3], v[0:1] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
global_load_b64 v[0:1], v2, s[0:1] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -360,8 +346,8 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], float %a) { ; GFX7-LABEL: test_div_scale_f32_scalar_num_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s8, s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -378,8 +364,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f32_scalar_num_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -396,10 +382,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f32_scalar_num_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; 
GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] @@ -410,12 +395,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f32_scalar_num_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x54 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x54 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -438,8 +420,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out, ptr addrspace(1) %in, float %a) { ; GFX7-LABEL: test_div_scale_f32_scalar_num_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -456,8 +438,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f32_scalar_num_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -474,10 +456,9 @@ define amdgpu_kernel void 
@test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f32_scalar_num_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] @@ -488,12 +469,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f32_scalar_num_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -516,8 +494,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) { ; GFX7-LABEL: test_div_scale_f32_scalar_den_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -534,8 +512,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f32_scalar_den_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -552,10 +530,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f32_scalar_den_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] @@ -566,12 +543,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f32_scalar_den_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -594,8 +568,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) { ; GFX7-LABEL: test_div_scale_f32_scalar_den_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -612,8 +586,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f32_scalar_den_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -630,10 +604,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f32_scalar_den_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] @@ -644,12 +617,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f32_scalar_den_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -672,8 +642,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) 
%out, ptr addrspace(1) %in, [8 x i32], double %a) { ; GFX7-LABEL: test_div_scale_f64_scalar_num_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -690,8 +660,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_scalar_num_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -708,10 +678,9 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f64_scalar_num_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] @@ -722,13 +691,10 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f64_scalar_num_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 
s[0:1], s[0:1], 0x54 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -751,8 +717,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %a) { ; GFX7-LABEL: test_div_scale_f64_scalar_num_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -769,8 +735,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_scalar_num_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -787,10 +753,9 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f64_scalar_num_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] @@ -801,13 +766,10 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f64_scalar_num_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: 
s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -830,8 +792,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_scalar_den_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -848,8 +810,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_scalar_den_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -866,10 +828,9 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f64_scalar_den_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] @@ -880,13 +841,10 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f64_scalar_den_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -909,8 +867,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_scalar_den_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -927,8 +885,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_scalar_den_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -945,10 +903,9 @@ define amdgpu_kernel void 
@test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f64_scalar_den_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] @@ -959,13 +916,10 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f64_scalar_den_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -988,26 +942,25 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b) { ; GFX7-LABEL: test_div_scale_f32_all_scalar_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x1c -; GFX7-NEXT: s_load_dword s5, s[2:3], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], v0, v0, s5 +; GFX7-NEXT: 
v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], v0, v0, s4 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_all_scalar_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x70 -; GFX8-NEXT: s_load_dword s1, s[2:3], 0x4c -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1016,24 +969,24 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f32_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s5, s5, s4 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_scale_f32 v0, s0, s5, s5, s4 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_all_scalar_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s4 +; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1047,26 +1000,25 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b) { ; GFX7-LABEL: test_div_scale_f32_all_scalar_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x1c -; GFX7-NEXT: s_load_dword s5, s[2:3], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], s5, v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], s4, v0, s4 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_all_scalar_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x70 -; GFX8-NEXT: s_load_dword s1, s[2:3], 0x4c -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s1, v0, s1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s3, v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1075,24 +1027,24 @@ define amdgpu_kernel 
void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f32_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s4, s5, s4 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s5, s4 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_all_scalar_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s5, s4 +; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1106,13 +1058,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_all_scalar_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x1d +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1120,14 +1072,13 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_all_scalar_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[4:5] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1136,26 +1087,24 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f64_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[4:5], s[4:5], s[0:1] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_all_scalar_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x74 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[0:1] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1169,13 +1118,13 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_all_scalar_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x1d +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[6:7], v[0:1], s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1183,14 +1132,13 @@ define 
amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_all_scalar_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[4:5], v[0:1], s[4:5] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1199,26 +1147,24 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f64_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[4:5], s[0:1] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_all_scalar_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x74 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: 
s_load_b64 s[4:5], s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[4:5], s[0:1] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[4:5], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1232,7 +1178,7 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_inline_imm_num: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1249,7 +1195,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; ; GFX8-LABEL: test_div_scale_f32_inline_imm_num: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1266,7 +1212,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; ; GFX10-LABEL: test_div_scale_f32_inline_imm_num: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1278,10 +1224,8 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; ; GFX11-LABEL: test_div_scale_f32_inline_imm_num: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1303,7 +1247,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_inline_imm_den: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1320,7 +1264,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; ; GFX8-LABEL: test_div_scale_f32_inline_imm_den: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1337,7 +1281,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; ; GFX10-LABEL: test_div_scale_f32_inline_imm_den: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1349,10 +1293,8 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; ; GFX11-LABEL: test_div_scale_f32_inline_imm_den: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1374,7 +1316,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_fabs_num: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1394,7 +1336,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: test_div_scale_f32_fabs_num: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1416,7 +1358,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: test_div_scale_f32_fabs_num: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1431,9 +1373,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: test_div_scale_f32_fabs_num: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1466,7 +1406,7 @@ define amdgpu_kernel void 
@test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_fabs_den: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1486,7 +1426,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: test_div_scale_f32_fabs_den: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1508,7 +1448,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: test_div_scale_f32_fabs_den: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1523,9 +1463,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: test_div_scale_f32_fabs_den: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1558,7 +1496,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f32_val_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, v0 @@ -1570,8 +1508,8 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou ; GFX8-LABEL: test_div_scale_f32_val_undef_val: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, v0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1580,7 +1518,7 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou ; ; GFX10-LABEL: test_div_scale_f32_val_undef_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000 @@ -1589,7 +1527,7 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_div_scale_f32_val_undef_val: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000 @@ -1606,7 +1544,7 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f32_undef_val_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, s0 @@ -1618,8 +1556,8 @@ define amdgpu_kernel 
void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou ; GFX8-LABEL: test_div_scale_f32_undef_val_val: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1628,7 +1566,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou ; ; GFX10-LABEL: test_div_scale_f32_undef_val_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0 @@ -1637,7 +1575,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_div_scale_f32_undef_val_val: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0 @@ -1654,7 +1592,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f32_undef_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, s0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -1664,8 +1602,8 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % ; ; GFX8-LABEL: test_div_scale_f32_undef_undef_val: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, s0 -; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1674,7 +1612,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % ; ; GFX10-LABEL: test_div_scale_f32_undef_undef_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0 @@ -1683,7 +1621,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % ; ; GFX11-LABEL: test_div_scale_f32_undef_undef_val: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0 @@ -1700,7 +1638,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f64_val_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x40200000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1714,8 +1652,8 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x40200000 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[0:1], s[0:1], v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: 
v_mov_b32_e32 v2, s0 @@ -1724,8 +1662,8 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; ; GFX10-LABEL: test_div_scale_f64_val_undef_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], 0x40200000 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], 0x40200000 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1734,7 +1672,7 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; GFX11-LABEL: test_div_scale_f64_val_undef_val: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], 0x40200000 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll index 2a260823732ca9..d7b7f03d428bfb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-LABEL: test_wave32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB0_2 @@ -14,7 +14,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: .LBB0_2: ; %bb -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x24 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -25,16 +25,16 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; ; GFX11-LABEL: test_wave32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX11-NEXT: ; %bb.1: ; %mid ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB0_2: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll index 06393857352b3a..81d8472ebd46ef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) { ; GCN-LABEL: test_wave64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 @@ -13,7 +13,7 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) { ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB0_2: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll index 59818b0b1bc39b..ade6e55b482bb7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -134,8 +134,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX10-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -147,10 +147,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX11-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -160,7 +160,7 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN @@ -179,8 +179,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; 
GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -190,16 +190,16 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll index de91c45000f137..752ddbb896c6b1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll @@ -61,8 +61,8 @@ define void @global_atomic_fadd_f32_off_neg2047(ptr addrspace(1) %ptr, float %da define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(ptr addrspace(1) %ptr, float %data) { ; GFX908-LABEL: global_atomic_fadd_f32_off_ss: ; GFX908: ; %bb.0: -; 
GFX908-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX908-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v0, s2 @@ -71,8 +71,8 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(ptr addrspace(1) %ptr, ; ; GFX90A-LABEL: global_atomic_fadd_f32_off_ss: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll index ec069c10a8d212..1e0cbde7df0dbf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-LABEL: test_wave32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s1, s[6:7], 0x24 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s1, s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 @@ -22,14 +22,14 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX11-LABEL: test_wave32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s0, 0 -; GFX11-NEXT: s_cselect_b32 s0, 1, 0 
-; GFX11-NEXT: s_and_b32 s0, 1, s0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 +; GFX11-NEXT: s_cselect_b32 s1, 1, 0 +; GFX11-NEXT: s_and_b32 s1, 1, s1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 +; GFX11-NEXT: s_or_b32 s0, s1, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll index d7a82b415ff06c..9718cef5c6db0e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) { ; GCN-LABEL: test_wave64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[6:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xa +; GCN-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cselect_b32 s2, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index 69f9a5712b0b5a..546376c5962be7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -628,7 +628,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 ; 
GFX1030-NEXT: v_mov_b32_e32 v6, 4.0 @@ -658,7 +658,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 @@ -688,33 +688,33 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 -; GFX11-NEXT: s_mov_b32 s9, 4.0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_mov_b32 s8, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 +; GFX11-NEXT: s_mov_b32 s9, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 ; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_mov_b32 s1, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: 
v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] +; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -742,7 +742,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 @@ -769,7 +769,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) @@ -796,29 +796,28 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 ; GFX11-NEXT: s_mov_b32 s10, 
0x45004800 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_mov_b32 s1, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] +; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -847,8 +846,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -876,8 +875,8 @@ define amdgpu_kernel void 
@image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 @@ -889,8 +888,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s0 -; GFX1013-NEXT: v_mov_b32_e32 v1, s1 +; GFX1013-NEXT: v_mov_b32_e32 v0, s2 +; GFX1013-NEXT: v_mov_b32_e32 v1, s3 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: flat_load_dword v2, v[0:1] @@ -904,30 +903,29 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s16, 0xb36211c7 -; GFX11-NEXT: s_mov_b32 s6, 2.0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_movk_i32 s17, 0x102 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v9, s16 :: v_dual_lshlrev_b32 v2, 2, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s8, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX11-NEXT: s_mov_b32 s6, 2.0 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s9, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 
-; GFX11-NEXT: v_dual_mov_b32 v10, s17 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 -; GFX11-NEXT: v_mov_b32_e32 v4, s9 -; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, s16 +; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_mov_b32_e32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b32 s5, 1.0 -; GFX11-NEXT: v_mov_b32_e32 v7, s13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v10, s17 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_load_b32 v11, v[0:1] @@ -959,8 +957,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -985,8 +983,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 @@ -995,8 +993,8 @@ define 
amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s0 -; GFX1013-NEXT: v_mov_b32_e32 v1, s1 +; GFX1013-NEXT: v_mov_b32_e32 v0, s2 +; GFX1013-NEXT: v_mov_b32_e32 v1, s3 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: flat_load_dword v2, v[0:1] @@ -1010,24 +1008,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s12, 0xb36211c6 -; GFX11-NEXT: s_mov_b32 s6, 2.0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_movk_i32 s13, 0x102 +; GFX11-NEXT: s_mov_b32 s6, 2.0 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800 -; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v3, s8 -; GFX11-NEXT: v_mov_b32_e32 v7, s13 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: s_mov_b32 s5, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: v_add_co_u32 v0, 
vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_load_b32 v8, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll index b0c6e89380d810..5c22d5bdcf7449 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-LABEL: is_private_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x32 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x32 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -26,7 +26,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX9-LABEL: is_private_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc @@ -39,7 +39,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX10-LABEL: is_private_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc @@ -53,14 +53,13 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX11-LABEL: is_private_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 
3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -79,9 +78,9 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; CI-LABEL: is_private_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s0, s[6:7], 0x32 +; CI-NEXT: s_load_dword s0, s[4:5], 0x32 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 ; CI-NEXT: s_cbranch_scc1 .LBB1_2 @@ -94,7 +93,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX9-LABEL: is_private_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s1, s3 @@ -108,7 +107,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX10-LABEL: is_private_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s1, s3 @@ -122,7 +121,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX11-LABEL: is_private_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll index bbcb807a956bee..e005c38355a3ce 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-LABEL: is_local_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x33 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x33 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -26,7 +26,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX9-LABEL: is_local_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc @@ -39,7 +39,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX10-LABEL: is_local_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc @@ -53,14 +53,13 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX11-LABEL: is_local_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -79,9 +78,9 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; CI-LABEL: is_local_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s0, s[6:7], 0x33 +; CI-NEXT: s_load_dword s0, s[4:5], 0x33 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 ; CI-NEXT: s_cbranch_scc1 .LBB1_2 @@ -94,7 +93,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX9-LABEL: is_local_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s1, s3 @@ -108,7 +107,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX10-LABEL: is_local_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s1, s3 @@ -122,7 +121,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX11-LABEL: is_local_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll index 1676b69c8c6318..7fc9842824b01d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll @@ -4,9 +4,9 @@ ; ALL-LABEL: {{^}}test: ; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1 -; CO-V4: s_load_dword s{{[0-9]+}}, s[8:9], 0xa +; CO-V4: s_load_dword s{{[0-9]+}}, s[4:5], 0xa -; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[4:5], 0xa +; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa ; HSA: .amdhsa_kernarg_size 8 ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 1 @@ -81,7 +81,7 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(ptr addrspace(1) %out, ; HSA: .amdhsa_kernarg_size 0 ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -define amdgpu_kernel void @test_no_kernargs() #4 { +define amdgpu_kernel void @test_no_kernargs() #1 { %kernarg.segment.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() %gep = getelementptr i32, ptr addrspace(4) %kernarg.segment.ptr, i64 10 %value = load i32, ptr addrspace(4) %gep @@ -126,7 +126,6 @@ attributes #0 = { nounwind readnone } attributes #1 = { nounwind "amdgpu-implicitarg-num-bytes"="0" } attributes #2 = { nounwind "amdgpu-implicitarg-num-bytes"="48" } attributes #3 = { nounwind "amdgpu-implicitarg-num-bytes"="38" } -attributes #4 = { nounwind "amdgpu-implicitarg-num-bytes"="0" "amdgpu-no-implicitarg-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll index 4d012796693cb6..e7faabb72ab691 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_32x32x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GCN-NEXT: 
s_mov_b64 s[36:37], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s36, 2 @@ -81,7 +81,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_16x16x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 ; GCN-NEXT: s_mov_b64 s[18:19], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s18, 2 @@ -127,7 +127,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: s_mov_b64 s[6:7], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s6, 2 @@ -157,7 +157,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_32x32x8bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 ; GCN-NEXT: s_mov_b64 s[18:19], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s18, 2 @@ -204,7 +204,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: s_mov_b64 s[6:7], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s6, 2 @@ -235,11 +235,11 @@ bb: define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_4x4x4f64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0 ; GCN-NEXT: s_nop 3 @@ -258,8 +258,8 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1] ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -292,11 +292,11 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_imm: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 @@ -317,8 +317,8 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 
s[12:13], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b64 s[6:7], 1.0 ; GCN-NEXT: s_mov_b64 s[2:3], s[0:1] @@ -352,9 +352,9 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN-NEXT: s_mov_b32 s5, 0x405ec000 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll index aa21e67544d65d..c0cd0686072002 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -8,10 +8,10 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 @@ -22,22 +22,22 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; encoding: [0x01,0x01,0x00,0xf4,0x2c,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; 
encoding: [0x01,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; encoding: [0x00,0x01,0x00,0xf4,0x2c,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; encoding: [0x80,0x02,0x02,0x7e] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; encoding: [0x04,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x00,0x00] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x02,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX11-LABEL: dpp_test: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; encoding: [0x01,0x01,0x00,0xf4,0x2c,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; encoding: [0x01,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; encoding: [0x80,0x00,0x10,0xca,0x04,0x00,0x00,0x01] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; encoding: [0x80,0x00,0x10,0xca,0x02,0x00,0x00,0x01] ; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x00,0x00] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] @@ -50,7 +50,7 @@ 
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; GFX8-LABEL: mov_dpp64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -63,7 +63,7 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; ; GFX10-LABEL: mov_dpp64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; encoding: [0x01,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] @@ -75,7 +75,7 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; ; GFX11-LABEL: mov_dpp64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; encoding: [0x01,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; encoding: [0x02,0x00,0x10,0xca,0x03,0x00,0x00,0x00] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll index dd351e193e9e6e..1eb0c2a8774258 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll @@ -2,20 +2,10 @@ ; FIXME: Error on non-hsa target -; GCN-LABEL: {{^}}queue_ptr: -; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 
0x0 -; GCN: .amdhsa_user_sgpr_queue_ptr 1 -define amdgpu_kernel void @queue_ptr(ptr addrspace(1) %out) { - %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 - %value = load i32, ptr addrspace(4) %queue_ptr - store i32 %value, ptr addrspace(1) %out - ret void -} - -; GCN-LABEL: {{^}}queue_ptr_opt: +; GCN-LABEL: {{^}}test: ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN: .amdhsa_user_sgpr_queue_ptr 1 -define amdgpu_kernel void @queue_ptr_opt(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test(ptr addrspace(1) %out) { %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 %value = load i32, ptr addrspace(4) %queue_ptr store i32 %value, ptr addrspace(1) %out @@ -25,7 +15,6 @@ define amdgpu_kernel void @queue_ptr_opt(ptr addrspace(1) %out) #1 { declare noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 attributes #0 = { nounwind readnone } -attributes #1 = { "amdgpu-no-dispatch-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index 5a4b4e62bd8ae5..6d4aa3b04d7612 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -44,7 +44,7 @@ define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i3 define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_arg_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 @@ -62,7 +62,7 @@ define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; GFX6-LABEL: 
bfe_i32_arg_imm_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s3, 59, s3 @@ -80,7 +80,7 @@ define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_i32_imm_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s4, s2, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -99,7 +99,7 @@ define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @v_bfe_print_arg(ptr addrspace(1) %out, ptr addrspace(1) %src0) #0 { ; GFX6-LABEL: v_bfe_print_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -118,7 +118,7 @@ define amdgpu_kernel void @v_bfe_print_arg(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_0_width_reg_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_bfe_i32 s3, s2, s3 @@ -135,11 +135,11 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_0_width_imm_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword 
s3, s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s3, s4, 8 +; GFX6-NEXT: s_bfe_i32 s3, s3, 8 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_i32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -173,7 +173,7 @@ define amdgpu_kernel void @bfe_i32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -194,7 +194,7 @@ define amdgpu_kernel void @bfe_i32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -215,7 +215,7 @@ define amdgpu_kernel void @bfe_i32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -234,7 +234,7 @@ define amdgpu_kernel void @bfe_i32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -253,7 +253,7 @@ define amdgpu_kernel void @bfe_i32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -272,7 +272,7 @@ define amdgpu_kernel void @bfe_i32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -291,7 +291,7 @@ define amdgpu_kernel void @bfe_i32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -311,7 +311,7 @@ define amdgpu_kernel void @bfe_i32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel 
void @bfe_i32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -331,7 +331,7 @@ define amdgpu_kernel void @bfe_i32_test_14(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, 0, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -347,7 +347,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, 0x302e, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -363,7 +363,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, 0, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -379,7 +379,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 
; GFX6-NEXT: s_bfe_i32 s2, 1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -395,7 +395,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, -1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -411,7 +411,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x10007 ; GFX6-NEXT: s_bfe_i32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -428,7 +428,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_i32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -445,7 +445,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_i32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -462,7 +462,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_7(ptr addrspace(1) %out) # 
define amdgpu_kernel void @bfe_i32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80006 ; GFX6-NEXT: s_bfe_i32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -479,7 +479,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80010 ; GFX6-NEXT: s_bfe_i32 s2, 0x10000, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -496,7 +496,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_i32 s2, 0xffff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -513,7 +513,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x40004 ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -530,7 +530,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -547,7 +547,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_i32 s2, 0x1fffe, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -564,7 +564,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1e0002 ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -581,7 +581,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_15(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1c0004 ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -598,7 +598,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, -1, 0x70001 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -614,7 +614,7 @@ define amdgpu_kernel 
void @bfe_i32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_17: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1f0001 ; GFX6-NEXT: s_bfe_i32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -631,7 +631,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_18: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_i32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -648,7 +648,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_sext_in_reg_i24: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -670,7 +670,7 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: simplify_demanded_bfe_sdiv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -694,7 +694,7 @@ define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr define amdgpu_kernel void @bfe_0_width(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_0_width: 
; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -713,7 +713,7 @@ define amdgpu_kernel void @bfe_0_width(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @bfe_8_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_8_bfe_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -734,7 +734,7 @@ define amdgpu_kernel void @bfe_8_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @bfe_8_bfe_16(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_8_bfe_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -756,7 +756,7 @@ define amdgpu_kernel void @bfe_8_bfe_16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @bfe_16_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_16_bfe_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -778,7 +778,7 @@ define amdgpu_kernel void @bfe_16_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s2, s3 ; GFX6-NEXT: s_bfe_i32 s3, s3, 
0x80000 @@ -799,7 +799,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(ptr addrspace(1) %out, i32 define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe_wrong: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s2, s3 ; GFX6-NEXT: s_bfe_i32 s3, s3, 8 @@ -820,7 +820,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(ptr addrspace(1) %out define amdgpu_kernel void @sextload_i8_to_i32_bfe(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: sextload_i8_to_i32_bfe: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -844,7 +844,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: sextload_i8_to_i32_bfe_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: sext_in_reg_i1_bfe_offset_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -890,7 +890,7 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(ptr addrspace(1) %out, pt define amdgpu_kernel void 
@sext_in_reg_i1_bfe_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: sext_in_reg_i1_bfe_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -912,7 +912,7 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(ptr addrspace(1) %out, pt define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: sext_in_reg_i2_bfe_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 5074f8814546ea..0c60be9d94591a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -4,11 +4,11 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec @@ -23,7 +23,7 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -43,20 +43,20 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 -; GCN-NEXT: s_load_dword s5, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 56 -; GCN-NEXT: s_cselect_b32 s3, 1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: s_cmp_lg_u32 s2, 56 +; GCN-NEXT: s_cselect_b32 s4, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: s_mov_b32 s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %.one ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 @@ -96,12 +96,12 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -116,7 +116,7 @@ 
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0xcccccccd ; GCN-NEXT: s_mov_b32 s5, 0x4010cccc ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -140,12 +140,12 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x10001 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -160,12 +160,12 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -180,7 +180,7 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: 
set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 1 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -204,7 +204,7 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 1.0 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -228,12 +228,12 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -248,7 +248,7 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x10001 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -272,7 +272,7 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: 
; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -296,7 +296,7 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -320,7 +320,7 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -340,11 +340,11 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -359,11 +359,11 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c 
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -378,11 +378,11 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -397,11 +397,11 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll index f3654fea486e0c..1d5cc1e1ec0463 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll @@ -40,8 
+40,8 @@ define double @v_trig_preop_f64_imm(double %a, i32 %b) { define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; CI-LABEL: s_trig_preop_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -57,8 +57,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; ; VI-LABEL: s_trig_preop_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -74,8 +74,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; ; GFX9-LABEL: s_trig_preop_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -86,8 +86,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; GFX10-LABEL: s_trig_preop_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], s2 ; GFX10-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -97,10 +97,10 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; GFX11-LABEL: s_trig_preop_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trig_preop_f64 v[0:1], s[0:1], s2 +; GFX11-NEXT: v_trig_preop_f64 v[0:1], s[2:3], s0 ; GFX11-NEXT: flat_store_b64 v[0:1], v[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm @@ -112,7 +112,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; CI-LABEL: s_trig_preop_f64_imm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; CI-NEXT: s_add_u32 s0, s0, 4 @@ -127,7 +127,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; VI-LABEL: s_trig_preop_f64_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; VI-NEXT: s_add_u32 s0, s0, 4 @@ -142,7 +142,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX9-LABEL: s_trig_preop_f64_imm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -151,7 +151,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX10-LABEL: s_trig_preop_f64_imm: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX10-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -160,7 +160,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX11-LABEL: 
s_trig_preop_f64_imm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX11-NEXT: flat_store_b64 v[0:1], v[0:1] dlc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll index d7fbec74af3858..43a0f018dc1cd2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -44,7 +44,7 @@ define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i3 define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_arg_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s4, s3, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -63,7 +63,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_arg_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 @@ -81,7 +81,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_arg_imm_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s3, 59, s3 @@ -99,7 +99,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr 
addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_imm_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s4, s2, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -118,7 +118,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_0_width_reg_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_bfe_u32 s3, s2, s3 @@ -135,11 +135,11 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_0_width_imm_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 8 +; GFX6-NEXT: s_bfe_u32 s3, s3, 8 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zextload_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) @@ -174,7 +174,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -197,7 +197,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -220,7 +220,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -243,7 +243,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -266,7 +266,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out define 
amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -289,7 +289,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -312,7 +312,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -331,7 +331,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -352,7 +352,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -373,7 +373,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -395,7 +395,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -417,7 +417,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -438,7 +438,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -459,7 +459,7 @@ define 
amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -480,7 +480,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -499,7 +499,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -518,7 +518,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -537,7 +537,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_12: ; GFX6: ; %bb.0: -; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -557,7 +557,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -577,7 +577,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -597,7 +597,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 0, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -613,7 +613,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 0x302e, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -629,7 +629,7 @@ 
define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 0, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -645,7 +645,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -661,7 +661,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, -1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -677,7 +677,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x10007 ; GFX6-NEXT: s_bfe_u32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -694,7 +694,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_6: ; 
GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_u32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -711,7 +711,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_u32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -728,7 +728,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80006 ; GFX6-NEXT: s_bfe_u32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -745,7 +745,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80010 ; GFX6-NEXT: s_bfe_u32 s2, 0x10000, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -762,7 +762,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_u32 s2, 0xffff, s2 ; 
GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -779,7 +779,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x40004 ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -796,7 +796,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -813,7 +813,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_u32 s2, 0x1fffe, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -830,7 +830,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1e0002 ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -847,7 +847,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr 
addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1c0004 ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -864,7 +864,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, -1, 0x70001 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -880,7 +880,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_17: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1f0001 ; GFX6-NEXT: s_bfe_u32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -897,7 +897,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_18: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_u32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -918,8 +918,8 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0, ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x4 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -947,11 +947,11 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0 define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: lshr_and: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x30006 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -965,7 +965,7 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: v_lshr_and: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b32 s3, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s3, 7 @@ -983,11 +983,11 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: and_lshr: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x30006 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1001,11 +1001,11 @@ define amdgpu_kernel void @and_lshr(ptr 
addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: and_lshr2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x30006 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1019,11 +1019,11 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: shl_lshr: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x150002 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x150002 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 2198ba9f1d964d..727184a36c0067 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -19,7 +19,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, 
i32 %in2) { ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -30,7 +30,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; ; GFX11-LABEL: dpp_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -46,7 +46,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) { ; GFX8-LABEL: update_dppi64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -64,7 +64,7 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; ; GFX10-LABEL: update_dppi64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -78,11 +78,10 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; ; GFX11-LABEL: update_dppi64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; 
GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -102,7 +101,7 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) { ; GFX8-LABEL: update_dppf64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -120,7 +119,7 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; ; GFX10-LABEL: update_dppf64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -134,11 +133,10 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; ; GFX11-LABEL: update_dppf64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -158,7 +156,7 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) { ; GFX8-LABEL: update_dppv2i32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -176,7 +174,7 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; ; GFX10-LABEL: update_dppv2i32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -190,11 +188,10 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; ; GFX11-LABEL: update_dppv2i32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -214,7 +211,7 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) { ; GFX8-LABEL: update_dppv2f32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -232,7 +229,7 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; ; GFX10-LABEL: update_dppv2f32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 
v[0:1], v4, s[0:1] @@ -246,11 +243,10 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; ; GFX11-LABEL: update_dppv2f32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -270,7 +266,7 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) { ; GFX8-LABEL: update_dpp_p0_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -288,7 +284,7 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; ; GFX10-LABEL: update_dpp_p0_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -302,11 +298,10 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; ; GFX11-LABEL: update_dpp_p0_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: 
global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -326,7 +321,7 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) { ; GFX8-LABEL: update_dpp_p3_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -341,7 +336,7 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ; ; GFX10-LABEL: update_dpp_p3_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -354,8 +349,7 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ; ; GFX11-LABEL: update_dpp_p3_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -377,11 +371,11 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; GFX8-LABEL: update_dpp_p5_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s90, -1 ; GFX8-NEXT: s_mov_b32 s91, 0xe80000 -; GFX8-NEXT: s_add_u32 s88, s88, s9 +; GFX8-NEXT: s_add_u32 s88, s88, s3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_addc_u32 s89, s89, 0 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) @@ -396,27 +390,26 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; ; GFX10-LABEL: update_dpp_p5_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s14, -1 -; GFX10-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-NEXT: s_add_u32 s12, s12, s9 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GFX10-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dpp_p5_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll index b2546700a935db..df201c1903b642 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll @@ -1,8 +1,7 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa 
-passes=amdgpu-attributor %s -o %t.bc -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D %s declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll index d5646820a19832..09882c446fc0fc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll @@ -1,14 +1,12 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v4.ll -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v6.ll -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s -; RUN: llc -global-isel 
-mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -verify-machineinstrs -amdgpu-enable-vopd=0 < %t.v6.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs| FileCheck --check-prefixes=ALL,HSA,UNPACKED %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii 
-verify-machineinstrs | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -verify-machineinstrs -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=ALL,PACKED-TID %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index 646cb48d37367b..36bac87889cacd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @localize_constants(i1 %cond) { ; GFX9-LABEL: localize_constants: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s1, s1, 1 @@ -95,7 +95,7 @@ bb2: define amdgpu_kernel void @localize_globals(i1 %cond) { ; GFX9-LABEL: localize_globals: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s1, s1, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 66037615f0ba03..2727fdec035d22 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -10,7 +10,7 @@ ; Note: we use MIR test checks + stop after legalizer to prevent ; tests from being optimized out. -define amdgpu_kernel void @system_one_as_acquire() #0 { +define amdgpu_kernel void @system_one_as_acquire() { ; GFX6-LABEL: name: system_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -59,7 +59,7 @@ entry: ret void } -define amdgpu_kernel void @system_one_as_release() #0 { +define amdgpu_kernel void @system_one_as_release() { ; GFX6-LABEL: name: system_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -98,7 +98,7 @@ entry: ret void } -define amdgpu_kernel void @system_one_as_acq_rel() #0 { +define amdgpu_kernel void @system_one_as_acq_rel() { ; GFX6-LABEL: name: system_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -147,7 +147,7 @@ entry: ret void } -define amdgpu_kernel void @system_one_as_seq_cst() #0 { +define amdgpu_kernel void @system_one_as_seq_cst() { ; GFX6-LABEL: name: system_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -196,7 +196,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_one_as_acquire() #0 { +define amdgpu_kernel void @singlethread_one_as_acquire() { ; GFX6-LABEL: name: singlethread_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -225,7 +225,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_one_as_release() #0 { +define amdgpu_kernel void @singlethread_one_as_release() { ; GFX6-LABEL: name: singlethread_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -254,7 +254,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_one_as_acq_rel() #0 { +define amdgpu_kernel void @singlethread_one_as_acq_rel() { ; GFX6-LABEL: 
name: singlethread_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -283,7 +283,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_one_as_seq_cst() #0 { +define amdgpu_kernel void @singlethread_one_as_seq_cst() { ; GFX6-LABEL: name: singlethread_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -312,7 +312,7 @@ entry: ret void } -define amdgpu_kernel void @agent_one_as_acquire() #0 { +define amdgpu_kernel void @agent_one_as_acquire() { ; GFX6-LABEL: name: agent_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -361,7 +361,7 @@ entry: ret void } -define amdgpu_kernel void @agent_one_as_release() #0 { +define amdgpu_kernel void @agent_one_as_release() { ; GFX6-LABEL: name: agent_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -400,7 +400,7 @@ entry: ret void } -define amdgpu_kernel void @agent_one_as_acq_rel() #0 { +define amdgpu_kernel void @agent_one_as_acq_rel() { ; GFX6-LABEL: name: agent_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -449,7 +449,7 @@ entry: ret void } -define amdgpu_kernel void @agent_one_as_seq_cst() #0 { +define amdgpu_kernel void @agent_one_as_seq_cst() { ; GFX6-LABEL: name: agent_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 3952 @@ -498,7 +498,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_one_as_acquire() #0 { +define amdgpu_kernel void @workgroup_one_as_acquire() { ; GFX6-LABEL: name: workgroup_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -533,7 +533,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_one_as_release() #0 { +define amdgpu_kernel void @workgroup_one_as_release() { ; GFX6-LABEL: name: workgroup_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -566,7 +566,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { +define amdgpu_kernel void @workgroup_one_as_acq_rel() { ; GFX6-LABEL: name: workgroup_one_as_acq_rel ; GFX6: 
bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -601,7 +601,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { +define amdgpu_kernel void @workgroup_one_as_seq_cst() { ; GFX6-LABEL: name: workgroup_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -636,7 +636,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_one_as_acquire() #0 { +define amdgpu_kernel void @wavefront_one_as_acquire() { ; GFX6-LABEL: name: wavefront_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -665,7 +665,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_one_as_release() #0 { +define amdgpu_kernel void @wavefront_one_as_release() { ; GFX6-LABEL: name: wavefront_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -694,7 +694,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_one_as_acq_rel() #0 { +define amdgpu_kernel void @wavefront_one_as_acq_rel() { ; GFX6-LABEL: name: wavefront_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -723,7 +723,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_one_as_seq_cst() #0 { +define amdgpu_kernel void @wavefront_one_as_seq_cst() { ; GFX6-LABEL: name: wavefront_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -752,7 +752,7 @@ entry: ret void } -define amdgpu_kernel void @system_acquire() #0 { +define amdgpu_kernel void @system_acquire() { ; GFX6-LABEL: name: system_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -801,7 +801,7 @@ entry: ret void } -define amdgpu_kernel void @system_release() #0 { +define amdgpu_kernel void @system_release() { ; GFX6-LABEL: name: system_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -840,7 +840,7 @@ entry: ret void } -define amdgpu_kernel void @system_acq_rel() #0 { +define amdgpu_kernel void @system_acq_rel() { ; GFX6-LABEL: name: system_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -889,7 +889,7 @@ entry: ret void } -define amdgpu_kernel void 
@system_seq_cst() #0 { +define amdgpu_kernel void @system_seq_cst() { ; GFX6-LABEL: name: system_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -938,7 +938,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_acquire() #0 { +define amdgpu_kernel void @singlethread_acquire() { ; GFX6-LABEL: name: singlethread_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -967,7 +967,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_release() #0 { +define amdgpu_kernel void @singlethread_release() { ; GFX6-LABEL: name: singlethread_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -996,7 +996,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_acq_rel() #0 { +define amdgpu_kernel void @singlethread_acq_rel() { ; GFX6-LABEL: name: singlethread_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -1025,7 +1025,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_seq_cst() #0 { +define amdgpu_kernel void @singlethread_seq_cst() { ; GFX6-LABEL: name: singlethread_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -1054,7 +1054,7 @@ entry: ret void } -define amdgpu_kernel void @agent_acquire() #0 { +define amdgpu_kernel void @agent_acquire() { ; GFX6-LABEL: name: agent_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -1103,7 +1103,7 @@ entry: ret void } -define amdgpu_kernel void @agent_release() #0 { +define amdgpu_kernel void @agent_release() { ; GFX6-LABEL: name: agent_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -1142,7 +1142,7 @@ entry: ret void } -define amdgpu_kernel void @agent_acq_rel() #0 { +define amdgpu_kernel void @agent_acq_rel() { ; GFX6-LABEL: name: agent_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -1191,7 +1191,7 @@ entry: ret void } -define amdgpu_kernel void @agent_seq_cst() #0 { +define amdgpu_kernel void @agent_seq_cst() { ; GFX6-LABEL: name: agent_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 112 @@ -1240,7 +1240,7 @@ entry: 
ret void } -define amdgpu_kernel void @workgroup_acquire() #0 { +define amdgpu_kernel void @workgroup_acquire() { ; GFX6-LABEL: name: workgroup_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 127 @@ -1279,7 +1279,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_release() #0 { +define amdgpu_kernel void @workgroup_release() { ; GFX6-LABEL: name: workgroup_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 127 @@ -1316,7 +1316,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_acq_rel() #0 { +define amdgpu_kernel void @workgroup_acq_rel() { ; GFX6-LABEL: name: workgroup_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 127 @@ -1355,7 +1355,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_seq_cst() #0 { +define amdgpu_kernel void @workgroup_seq_cst() { ; GFX6-LABEL: name: workgroup_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_WAITCNT_soft 127 @@ -1394,7 +1394,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_acquire() #0 { +define amdgpu_kernel void @wavefront_acquire() { ; GFX6-LABEL: name: wavefront_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -1423,7 +1423,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_release() #0 { +define amdgpu_kernel void @wavefront_release() { ; GFX6-LABEL: name: wavefront_release ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -1452,7 +1452,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_acq_rel() #0 { +define amdgpu_kernel void @wavefront_acq_rel() { ; GFX6-LABEL: name: wavefront_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -1481,7 +1481,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_seq_cst() #0 { +define amdgpu_kernel void @wavefront_seq_cst() { ; GFX6-LABEL: name: wavefront_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: S_ENDPGM 0 @@ -1509,5 +1509,3 @@ entry: fence syncscope("wavefront") seq_cst ret void } - -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 577a7d0b4cba0b..1140ef88ac7f85 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; GFX10-LABEL: v_mul_i64_no_zext: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -23,9 +23,7 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: v_mul_i64_no_zext: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -58,13 +56,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_mul_i64_zext_src1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX10-NEXT: global_load_dword v4, v3, s[0:1] +; GFX10-NEXT: 
global_load_dword v4, v3, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v0, v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 @@ -77,10 +75,8 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_mul_i64_zext_src1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -112,13 +108,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_mul_i64_zext_src0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 @@ -131,10 +127,8 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_mul_i64_zext_src0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: 
v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -166,13 +160,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_zext_src0_src1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -182,12 +176,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX11-LABEL: v_mul_i64_zext_src0_src1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] @@ -215,13 +207,13 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_masked_src0_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 @@ -234,10 +226,8 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX11-LABEL: v_mul_i64_masked_src0_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -269,13 +259,13 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_masked_src0_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -286,10 +276,8 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a ; GFX11-LABEL: v_mul_i64_masked_src0_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -319,16 +307,16 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_masked_src1_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: ; kill: killed $vgpr3 ; GFX10-NEXT: ; kill: killed $sgpr6_sgpr7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] -; GFX10-NEXT: ; kill: killed $sgpr0_sgpr1 +; GFX10-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3] +; GFX10-NEXT: ; kill: killed $sgpr2_sgpr3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -338,10 +326,8 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a ; GFX11-LABEL: v_mul_i64_masked_src1_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -369,7 +355,7 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; GFX10-LABEL: v_mul_i64_masked_src0: ; GFX10: 
; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -379,7 +365,7 @@ define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: v_mul_i64_masked_src0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -403,13 +389,13 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX10-LABEL: v_mul_i64_partially_masked_src0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -426,10 +412,8 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX11-LABEL: v_mul_i64_partially_masked_src0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -466,7 +450,7 @@ define amdgpu_kernel void 
@v_mul_i64_partially_masked_src0(ptr addrspace(1) %out define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; GFX10-LABEL: v_mul64_masked_before_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -476,7 +460,7 @@ define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_mul64_masked_before_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -514,13 +498,13 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX10-LABEL: v_mul64_masked_before_and_in_branch: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[4:5], v0, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3] ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3] @@ -549,10 +533,8 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-LABEL: v_mul64_masked_before_and_in_branch: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index b0f3eee3c73632..2d81452f9ef38d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2542,7 +2542,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: s_mul_u64_zext_with_sregs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2559,7 +2559,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: s_mul_u64_zext_with_sregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -2576,7 +2576,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: s_mul_u64_zext_with_sregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2590,7 +2590,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: s_mul_u64_zext_with_sregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2604,7 +2604,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: s_mul_u64_zext_with_sregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s3, s[2:3], 0x0 @@ -2619,7 +2619,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: s_mul_u64_zext_with_sregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -2718,7 +2718,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: s_mul_u64_sext_with_sregs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2738,7 +2738,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: s_mul_u64_sext_with_sregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -2758,7 +2758,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: s_mul_u64_sext_with_sregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2775,7 +2775,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: s_mul_u64_sext_with_sregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -2792,7 +2792,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: s_mul_u64_sext_with_sregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -2810,7 +2810,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: s_mul_u64_sext_with_sregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index c7afbeabbbb6b1..eaaeb3dc77a419 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -13,33 +13,33 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(ptr addrspace(1) %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[6:7], 0x8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_movk_i32 s32, 0x400 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_load_dword s4, s[6:7], 0xc +; GCN-NEXT: s_load_dword s6, s[4:5], 0xc ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_load_dword s5, s[6:7], 0x10 -; GCN-NEXT: s_add_u32 s4, s32, 0x1000 +; GCN-NEXT: s_load_dword s7, s[4:5], 0x10 +; GCN-NEXT: s_add_u32 s6, s32, 0x1000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s5, s5, 2 -; GCN-NEXT: s_add_u32 s4, s4, s5 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_add_u32 s6, s6, s7 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -84,29 +84,29 @@ bb.2: define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) { ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[6:7], 0x8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_movk_i32 s32, 0x1000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: 
s_load_dword s4, s[6:7], 0xc -; GCN-NEXT: s_add_u32 s5, s32, 0x1000 -; GCN-NEXT: s_and_b32 s5, s5, 0xfffff000 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xc +; GCN-NEXT: s_add_u32 s7, s32, 0x1000 +; GCN-NEXT: s_and_b32 s7, s7, 0xfffff000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 2 +; GCN-NEXT: s_lshl_b32 s6, s6, 2 ; GCN-NEXT: v_mov_b32_e32 v3, 1 -; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: s_add_u32 s6, s7, s6 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index cf69c50ed93572..b666f45521661c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -6,83 +6,83 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: sdivrem_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s8, s5, 31 -; GFX8-NEXT: s_add_i32 s0, s5, s8 -; GFX8-NEXT: s_xor_b32 s5, s0, s8 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: s_ashr_i32 s8, s7, 31 +; GFX8-NEXT: s_add_i32 s0, s7, s8 +; GFX8-NEXT: s_xor_b32 s7, s0, s8 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: 
v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_ashr_i32 s4, s6, 31 +; GFX8-NEXT: s_add_i32 s5, s6, s4 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s7, s6, s8 +; GFX8-NEXT: s_xor_b32 s5, s5, s4 +; GFX8-NEXT: s_xor_b32 s6, s4, s8 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s7, v2 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s7, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], 
v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdivrem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -95,17 +95,16 @@ define amdgpu_kernel void @sdivrem_i32(ptr 
addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s1, s1, s6 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s5, s1, s4 +; GFX10-NEXT: s_xor_b32 s7, s1, s6 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX10-NEXT: s_sub_i32 s1, 0, s5 -; GFX10-NEXT: s_xor_b32 s4, s8, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s1, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -113,17 +112,18 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s8, s6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; 
GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -145,7 +145,7 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: sdivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 ; GFX8-NEXT: s_ashr_i32 s12, s11, 31 @@ -305,7 +305,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s9, 31 ; GFX9-NEXT: s_ashr_i32 s12, s11, 31 @@ -459,7 +459,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s2, s9, 31 ; GFX10-NEXT: s_ashr_i32 s12, s11, 31 @@ -616,7 +616,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: sdivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s10, 31 ; GFX8-NEXT: s_add_i32 s0, s10, s2 @@ -692,7 +692,7 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s8, s6, 31 ; GFX9-NEXT: 
s_add_i32 s6, s6, s8 @@ -765,7 +765,7 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s1, s10, 31 ; GFX10-NEXT: s_ashr_i32 s2, s11, 31 @@ -845,8 +845,8 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s12, 31 ; GFX8-NEXT: s_add_i32 s0, s12, s2 @@ -986,19 +986,19 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s12, 31 -; GFX9-NEXT: s_add_i32 s0, s12, s4 -; GFX9-NEXT: s_xor_b32 s5, s0, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_ashr_i32 s6, s13, 31 -; GFX9-NEXT: s_add_i32 s7, s13, s6 +; GFX9-NEXT: s_ashr_i32 s6, s12, 31 +; GFX9-NEXT: s_add_i32 s0, s12, s6 +; GFX9-NEXT: s_xor_b32 s7, s0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_ashr_i32 s4, s13, 31 +; GFX9-NEXT: s_add_i32 s5, s13, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s7, s7, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s13, 0, s5 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, 
s5 +; GFX9-NEXT: s_sub_i32 s13, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_xor_b32 s8, s8, s12 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_sub_i32 s13, 0, s7 +; GFX9-NEXT: s_sub_i32 s13, 0, s5 ; GFX9-NEXT: v_mul_lo_u32 v3, s13, v1 ; GFX9-NEXT: s_ashr_i32 s13, s9, 31 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 @@ -1017,62 +1017,62 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 ; GFX9-NEXT: s_add_i32 s9, s9, s13 ; GFX9-NEXT: s_xor_b32 s9, s9, s13 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s5, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v2 -; GFX9-NEXT: s_xor_b32 s4, s12, s4 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v2 +; GFX9-NEXT: s_xor_b32 s6, s12, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 -; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: s_ashr_i32 s4, s14, 31 -; GFX9-NEXT: s_add_i32 s5, s14, s4 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 +; GFX9-NEXT: s_ashr_i32 s6, s14, 31 +; GFX9-NEXT: 
s_add_i32 s7, s14, s6 ; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: s_xor_b32 s7, s7, s6 ; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 -; GFX9-NEXT: s_sub_i32 s8, 0, s5 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: s_sub_i32 s8, 0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s8, v3 -; GFX9-NEXT: s_xor_b32 s6, s13, s6 -; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s6, v1 -; GFX9-NEXT: s_ashr_i32 s6, s15, 31 -; GFX9-NEXT: s_add_i32 s9, s15, s6 +; GFX9-NEXT: s_xor_b32 s4, s13, s4 +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 +; GFX9-NEXT: s_ashr_i32 s4, s15, 31 +; GFX9-NEXT: s_add_i32 s9, s15, s4 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX9-NEXT: s_xor_b32 s9, s9, s6 +; GFX9-NEXT: s_xor_b32 s9, s9, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s9 -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 -; GFX9-NEXT: s_ashr_i32 s7, s10, 31 -; GFX9-NEXT: s_add_i32 s8, s10, s7 -; GFX9-NEXT: s_xor_b32 s8, s8, s7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 +; GFX9-NEXT: s_ashr_i32 s5, s10, 31 +; GFX9-NEXT: s_add_i32 s8, s10, s5 +; GFX9-NEXT: s_xor_b32 s8, s8, s5 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: 
v_xor_b32_e32 v2, s13, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s5 +; GFX9-NEXT: v_mul_lo_u32 v6, v3, s7 ; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v2 @@ -1080,27 +1080,27 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_sub_i32 s8, 0, s9 ; GFX9-NEXT: v_mul_lo_u32 v8, s8, v7 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 -; GFX9-NEXT: s_ashr_i32 s5, s11, 31 -; GFX9-NEXT: s_add_i32 s8, s11, s5 -; GFX9-NEXT: s_xor_b32 s8, s8, s5 +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 +; GFX9-NEXT: s_ashr_i32 s7, s11, 31 +; GFX9-NEXT: s_add_i32 s8, s11, s7 +; GFX9-NEXT: s_xor_b32 s8, s8, s7 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 ; GFX9-NEXT: v_mul_hi_u32 v7, s8, v7 -; GFX9-NEXT: s_xor_b32 s4, s7, s4 +; GFX9-NEXT: s_xor_b32 s6, s5, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v3 +; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v7, s9 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 -; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 -; GFX9-NEXT: s_xor_b32 s4, s5, s6 +; GFX9-NEXT: s_xor_b32 s4, s7, s4 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc @@ -1112,12 +1112,12 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 ; GFX9-NEXT: 
v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v3, s4, v7 -; GFX9-NEXT: v_xor_b32_e32 v6, s7, v6 +; GFX9-NEXT: v_xor_b32_e32 v6, s5, v6 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_xor_b32_e32 v7, s5, v8 +; GFX9-NEXT: v_xor_b32_e32 v7, s7, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v6 -; GFX9-NEXT: v_subrev_u32_e32 v7, s5, v7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v6 +; GFX9-NEXT: v_subrev_u32_e32 v7, s7, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] @@ -1125,18 +1125,18 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s0, s12, 31 ; GFX10-NEXT: s_ashr_i32 s1, s13, 31 ; GFX10-NEXT: s_ashr_i32 s2, s14, 31 ; GFX10-NEXT: s_ashr_i32 s3, s15, 31 -; GFX10-NEXT: s_add_i32 s4, s12, s0 -; GFX10-NEXT: s_add_i32 s5, s13, s1 +; GFX10-NEXT: s_add_i32 s6, s12, s0 +; GFX10-NEXT: s_add_i32 s7, s13, s1 ; GFX10-NEXT: s_add_i32 s12, s14, s2 ; GFX10-NEXT: s_add_i32 s13, s15, s3 -; GFX10-NEXT: s_xor_b32 s14, s4, s0 -; GFX10-NEXT: s_xor_b32 s15, s5, s1 +; GFX10-NEXT: s_xor_b32 s14, s6, s0 +; GFX10-NEXT: s_xor_b32 s15, s7, s1 ; GFX10-NEXT: s_xor_b32 s12, s12, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX10-NEXT: s_xor_b32 s13, s13, s3 @@ -1144,11 +1144,11 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s13 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s14 +; GFX10-NEXT: s_sub_i32 s6, 0, s14 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX10-NEXT: s_sub_i32 s5, 0, s15 +; 
GFX10-NEXT: s_sub_i32 s7, 0, s15 ; GFX10-NEXT: s_sub_i32 s19, 0, s12 ; GFX10-NEXT: s_ashr_i32 s16, s8, 31 ; GFX10-NEXT: s_ashr_i32 s17, s9, 31 @@ -1163,22 +1163,22 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, s4, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s13 -; GFX10-NEXT: v_mul_lo_u32 v5, s5, v1 +; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0 +; GFX10-NEXT: s_sub_i32 s6, 0, s13 +; GFX10-NEXT: v_mul_lo_u32 v5, s7, v1 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s4, v3 +; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3 ; GFX10-NEXT: s_ashr_i32 s19, s11, 31 -; GFX10-NEXT: s_add_i32 s4, s8, s16 -; GFX10-NEXT: s_add_i32 s5, s9, s17 +; GFX10-NEXT: s_add_i32 s6, s8, s16 +; GFX10-NEXT: s_add_i32 s7, s9, s17 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 ; GFX10-NEXT: s_add_i32 s8, s10, s18 ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 ; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 ; GFX10-NEXT: s_add_i32 s9, s11, s19 -; GFX10-NEXT: s_xor_b32 s10, s4, s16 -; GFX10-NEXT: s_xor_b32 s11, s5, s17 +; GFX10-NEXT: s_xor_b32 s10, s6, s16 +; GFX10-NEXT: s_xor_b32 s11, s7, s17 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 ; GFX10-NEXT: s_xor_b32 s8, s8, s18 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v2, s8, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 ; GFX10-NEXT: s_xor_b32 s22, s18, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_lo_u32 v4, v0, s14 ; GFX10-NEXT: v_mul_lo_u32 v5, v1, s15 ; GFX10-NEXT: v_mul_lo_u32 v6, v2, s12 @@ -1271,8 +1271,8 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr 
addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: sdivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s4, s13, 31 ; GFX8-NEXT: s_ashr_i32 s6, s1, 31 @@ -1582,8 +1582,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s13, 31 ; GFX9-NEXT: s_ashr_i32 s6, s1, 31 @@ -1885,8 +1885,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-LABEL: sdivrem_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s16, s1, 31 ; GFX10-NEXT: s_ashr_i32 s4, s13, 31 @@ -2187,25 +2187,25 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) { ; GFX8-LABEL: sdiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008 -; GFX8-NEXT: s_ashr_i32 s5, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 +; GFX8-NEXT: s_bfe_i32 s0, s6, 0x80008 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, 
s0, s7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: s_sext_i32_i8 s4, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_sext_i32_i8 s4, s6 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2222,52 +2222,52 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x80008 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: 
v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_sext_i32_i8 s0, s0 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -2280,19 +2280,18 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX10-LABEL: sdiv_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80008 ; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; 
GFX10-NEXT: s_ashr_i32 s6, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s1, s1, s6 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s5, s1, s4 +; GFX10-NEXT: s_xor_b32 s7, s1, s6 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX10-NEXT: s_sub_i32 s1, 0, s5 -; GFX10-NEXT: s_xor_b32 s4, s8, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s1, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2300,17 +2299,18 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s8, s6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2332,14 +2332,14 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) { ; 
GFX8-LABEL: sdivrem_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: s_xor_b32 s8, s0, s3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s4, 0, s8 +; GFX8-NEXT: s_sub_i32 s6, 0, s8 ; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80018 ; GFX8-NEXT: s_ashr_i32 s10, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2351,10 +2351,10 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX8-NEXT: s_ashr_i32 s9, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s9 -; GFX8-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX8-NEXT: s_xor_b32 s0, s0, s9 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2420,45 +2420,45 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80010 -; GFX9-NEXT: s_ashr_i32 s5, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s5 -; GFX9-NEXT: s_xor_b32 s8, s0, s5 +; GFX9-NEXT: s_bfe_i32 s0, s6, 0x80010 +; GFX9-NEXT: s_ashr_i32 s7, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s7 +; GFX9-NEXT: s_xor_b32 s8, s0, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_bfe_i32 s7, s4, 0x80018 -; GFX9-NEXT: s_ashr_i32 s9, s7, 31 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_bfe_i32 s5, s6, 0x80018 +; GFX9-NEXT: s_ashr_i32 s9, s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 
-; GFX9-NEXT: s_add_i32 s7, s7, s9 -; GFX9-NEXT: s_xor_b32 s7, s7, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_xor_b32 s5, s5, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s10, 0, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i8 s6, s4 +; GFX9-NEXT: s_sext_i32_i8 s4, s6 ; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 -; GFX9-NEXT: s_ashr_i32 s10, s6, 31 +; GFX9-NEXT: s_ashr_i32 s10, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_add_i32 s6, s6, s10 -; GFX9-NEXT: s_xor_b32 s6, s6, s10 -; GFX9-NEXT: s_sub_i32 s11, 0, s7 +; GFX9-NEXT: s_add_i32 s4, s4, s10 +; GFX9-NEXT: s_xor_b32 s4, s4, s10 +; GFX9-NEXT: s_sub_i32 s11, 0, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x80008 -; GFX9-NEXT: s_ashr_i32 s11, s4, 31 +; GFX9-NEXT: s_bfe_i32 s6, s6, 0x80008 +; GFX9-NEXT: s_ashr_i32 s11, s6, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: s_add_i32 s4, s4, s11 +; GFX9-NEXT: s_add_i32 s6, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 -; GFX9-NEXT: s_xor_b32 s4, s4, s11 +; GFX9-NEXT: s_xor_b32 s4, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 @@ -2469,25 +2469,25 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 +; GFX9-NEXT: 
v_mul_lo_u32 v3, v1, s5 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s5, s10, s5 -; GFX9-NEXT: v_xor_b32_e32 v0, s5, v0 +; GFX9-NEXT: s_xor_b32 s6, s10, s7 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: s_xor_b32 s4, s11, s9 ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 ; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX9-NEXT: v_subrev_u32_e32 v0, s5, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 @@ -2505,7 +2505,7 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80018 ; GFX10-NEXT: s_bfe_i32 s3, s0, 0x80010 @@ -2517,36 +2517,36 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: s_xor_b32 s3, s3, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: s_sub_i32 s4, 0, s1 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; 
GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s4, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s3 -; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX10-NEXT: s_bfe_i32 s4, s0, 0x80008 +; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX10-NEXT: s_sub_i32 s6, 0, s3 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: s_ashr_i32 s9, s4, 31 +; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_add_i32 s4, s4, s9 +; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s4, s4, s9 +; GFX10-NEXT: s_xor_b32 s6, s6, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 @@ -2596,25 +2596,25 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) { ; GFX8-LABEL: sdiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010 -; GFX8-NEXT: s_ashr_i32 s5, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: 
s_xor_b32 s8, s0, s5 +; GFX8-NEXT: s_bfe_i32 s0, s6, 0x100010 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_sext_i32_i16 s4, s6 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2631,52 +2631,52 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x100010 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -2689,19 +2689,18 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX10-LABEL: sdiv_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: 
s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x100010 ; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s1, s1, s6 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s5, s1, s4 +; GFX10-NEXT: s_xor_b32 s7, s1, s6 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX10-NEXT: s_sub_i32 s1, 0, s5 -; GFX10-NEXT: s_xor_b32 s4, s8, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s1, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2709,17 +2708,18 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s8, s6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2741,14 +2741,14 @@ define amdgpu_kernel void 
@sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: sdivrem_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i16 s0, s3 ; GFX8-NEXT: s_ashr_i32 s8, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s8 ; GFX8-NEXT: s_xor_b32 s9, s0, s8 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX8-NEXT: s_sub_i32 s4, 0, s9 +; GFX8-NEXT: s_sub_i32 s6, 0, s9 ; GFX8-NEXT: s_bfe_i32 s1, s3, 0x100010 ; GFX8-NEXT: s_ashr_i32 s10, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2760,10 +2760,10 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX8-NEXT: s_xor_b32 s0, s0, s3 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2829,15 +2829,15 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s0, s5 +; GFX9-NEXT: s_sext_i32_i16 s0, s7 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_bfe_i32 s5, s5, 0x100010 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_bfe_i32 s5, s7, 0x100010 ; GFX9-NEXT: s_ashr_i32 s7, 
s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s5, s5, s7 @@ -2847,27 +2847,27 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s10, 0, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i16 s6, s4 +; GFX9-NEXT: s_sext_i32_i16 s4, s6 ; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 -; GFX9-NEXT: s_ashr_i32 s10, s6, 31 +; GFX9-NEXT: s_ashr_i32 s10, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_add_i32 s6, s6, s10 -; GFX9-NEXT: s_xor_b32 s6, s6, s10 +; GFX9-NEXT: s_add_i32 s4, s4, s10 +; GFX9-NEXT: s_xor_b32 s4, s4, s10 ; GFX9-NEXT: s_sub_i32 s11, 0, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x100010 -; GFX9-NEXT: s_ashr_i32 s11, s4, 31 +; GFX9-NEXT: s_bfe_i32 s6, s6, 0x100010 +; GFX9-NEXT: s_ashr_i32 s11, s6, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s9 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: s_add_i32 s4, s4, s11 +; GFX9-NEXT: s_add_i32 s6, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX9-NEXT: s_xor_b32 s4, s4, s11 +; GFX9-NEXT: s_xor_b32 s4, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s9, v3 @@ -2912,7 +2912,7 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s1 ; GFX10-NEXT: s_bfe_i32 s1, s1, 0x100010 @@ -2924,36 +2924,36 @@ define 
amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: s_xor_b32 s1, s1, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX10-NEXT: s_sub_i32 s4, 0, s2 +; GFX10-NEXT: s_sub_i32 s6, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s4, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX10-NEXT: s_sext_i32_i16 s4, s0 +; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_sext_i32_i16 s6, s0 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX10-NEXT: s_ashr_i32 s9, s4, 31 +; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_add_i32 s4, s4, s9 +; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s4, s4, s9 +; GFX10-NEXT: s_xor_b32 s6, s6, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 @@ -3002,25 +3002,25 @@ define amdgpu_kernel void 
@sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) { ; GFX8-LABEL: sdivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008 -; GFX8-NEXT: s_ashr_i32 s5, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 +; GFX8-NEXT: s_bfe_i32 s0, s6, 0x30008 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: s_bfe_i32 s4, s4, 0x30000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_bfe_i32 s4, s6, 0x30000 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3037,12 +3037,12 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; 
GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 @@ -3052,39 +3052,39 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX9-LABEL: sdivrem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x30008 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0x30000 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: 
v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -3099,19 +3099,18 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX10-LABEL: sdivrem_i3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x30008 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x30000 -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 -; GFX10-NEXT: s_ashr_i32 s5, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_add_i32 s0, s0, s5 -; GFX10-NEXT: s_xor_b32 s1, s1, s4 -; GFX10-NEXT: s_xor_b32 s0, s0, s5 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_ashr_i32 s7, s0, 31 +; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_add_i32 s0, s0, s7 +; GFX10-NEXT: s_xor_b32 s1, s1, s6 +; GFX10-NEXT: s_xor_b32 s0, s0, s7 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: s_sub_i32 s2, 0, s1 -; GFX10-NEXT: s_xor_b32 s4, s5, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -3129,14 +3128,15 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s7, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX10-NEXT: 
v_subrev_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 7, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3153,25 +3153,25 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) { ; GFX8-LABEL: sdivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000 -; GFX8-NEXT: s_ashr_i32 s5, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 +; GFX8-NEXT: s_bfe_i32 s0, s7, 0x1b0000 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: s_bfe_i32 s4, s4, 0x1b0000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_bfe_i32 s4, s6, 0x1b0000 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3188,12 +3188,12 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, 
v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3 @@ -3203,39 +3203,39 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_i27: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s1, 0x1b0000 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0x1b0000 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 
v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -3250,19 +3250,18 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i27: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s1, 0x1b0000 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x1b0000 -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 -; GFX10-NEXT: s_ashr_i32 s5, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_add_i32 s0, s0, s5 -; GFX10-NEXT: s_xor_b32 s1, s1, s4 -; GFX10-NEXT: s_xor_b32 s0, s0, s5 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_ashr_i32 s7, s0, 31 +; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_add_i32 s0, s0, s7 +; GFX10-NEXT: s_xor_b32 s1, s1, s6 +; GFX10-NEXT: s_xor_b32 s0, s0, s7 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: s_sub_i32 s2, 0, s1 -; GFX10-NEXT: s_xor_b32 s4, s5, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -3280,14 +3279,15 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s7, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 3729f1cc2b12d9..7ad19a47970039 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -238,7 +238,7 @@ define i64 @v_shl_i64_sext_i32_overflow(i32 %x) { define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GFX7-LABEL: mulu24_shl64: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 @@ -251,7 +251,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX8-LABEL: mulu24_shl64: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX8-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -266,7 +266,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: mulu24_shl64: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ 
-281,7 +281,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX10-LABEL: mulu24_shl64: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mul_u32_u24_e32 v0, 7, v0 @@ -296,7 +296,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX11-LABEL: mulu24_shl64: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 6, v0 ; GFX11-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] @@ -321,7 +321,7 @@ bb: define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture readonly %arg1) { ; GFX7-LABEL: muli24_shl64: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -340,7 +340,7 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX8-LABEL: muli24_shl64: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -363,7 +363,7 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX9-LABEL: muli24_shl64: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -378,7 +378,7 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; 
GFX10-LABEL: muli24_shl64: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -393,17 +393,16 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX11-LABEL: muli24_shl64: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v0, 0xff800000, v0 -; GFX11-NEXT: v_mul_i32_i24_e32 v0, -7, v0 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_or_b32_e32 v1, 0xff800000, v1 +; GFX11-NEXT: v_mul_i32_i24_e32 v1, -7, v1 +; GFX11-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 2d85081f5fc969..99aaec458c33ec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -10,21 +10,21 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -38,22 +38,22 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; GFX10-LABEL: store_lds_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -67,21 +67,21 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: 
; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s2, 8 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s5 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8 @@ -123,8 +123,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008 @@ -177,29 +177,29 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-NEXT: 
s_and_b32 s2, 0xffff, s4 +; GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_lshr_b32 s0, s5, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_lshr_b32 s2, s5, 16 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s2, s2, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: s_lshr_b32 s1, s1, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: s_lshr_b32 s4, s6, 16 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s1, s3, 8 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-NEXT: s_lshr_b32 s0, s5, 8 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s0, s3, 8 +; GFX10-NEXT: s_lshr_b32 s3, s2, 8 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v7, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s1 +; GFX10-NEXT: v_mov_b32_e32 v8, s0 ; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 @@ -209,7 +209,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v10, s0 +; GFX10-NEXT: v_mov_b32_e32 v10, s2 ; GFX10-NEXT: s_lshr_b32 s0, s4, 8 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 @@ -234,8 +234,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; 
GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX11-NEXT: s_lshr_b32 s1, s4, 16 @@ -289,14 +289,14 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 ; GFX9-NEXT: s_lshr_b32 s0, s5, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -317,8 +317,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s1, s4, 16 @@ -347,20 +347,20 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, 
s5 -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 +; GFX10-NEXT: s_lshr_b32 s0, s4, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s0, s5, 16 +; GFX10-NEXT: s_lshr_b32 s1, s5, 16 ; GFX10-NEXT: s_lshr_b32 s2, s6, 16 ; GFX10-NEXT: s_lshr_b32 s3, s7, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s7 -; GFX10-NEXT: v_mov_b32_e32 v5, s1 -; GFX10-NEXT: v_mov_b32_e32 v6, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, s3 ; GFX10-NEXT: ds_write_b16 v1, v0 @@ -376,8 +376,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s1, s4, 16 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 @@ -404,11 +404,11 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -418,8 +418,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -434,11 +434,11 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: v_mov_b32_e32 v4, s7 @@ -449,8 +449,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 @@ -465,21 +465,21 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; 
GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -493,11 +493,11 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: v_mov_b32_e32 v4, s7 @@ -508,8 +508,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -523,21 +523,21 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -551,22 +551,22 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; GFX10-LABEL: store_lds_v4i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll index 4ef79b752c4373..0f9ec965f2f0f4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -10,20 +10,20 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -36,21 +36,21 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) ; GFX10-LABEL: store_lds_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: 
v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 @@ -63,21 +63,21 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s2, 8 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s5 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8 @@ -107,8 +107,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008 @@ -150,32 +150,32 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; 
GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_lshr_b32 s0, s5, 16 +; GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_lshr_b32 s2, s5, 16 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s5 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: s_lshr_b32 s4, s6, 16 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: s_lshr_b32 s1, s3, 8 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s2, s2, 8 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: s_lshr_b32 s0, s5, 8 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: s_lshr_b32 s0, s3, 8 +; GFX10-NEXT: s_lshr_b32 s3, s2, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 8 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v9, s3 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-NEXT: v_mov_b32_e32 v10, s0 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: v_mov_b32_e32 v8, s0 +; GFX10-NEXT: v_mov_b32_e32 v10, s2 ; GFX10-NEXT: s_lshr_b32 s0, s4, 8 ; GFX10-NEXT: v_mov_b32_e32 v7, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s1 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 @@ -195,8 +195,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX11-NEXT: s_lshr_b32 s1, s4, 16 @@ -237,14 +237,14 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 ; GFX9-NEXT: s_lshr_b32 s0, s5, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -260,8 +260,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s1, s4, 16 @@ -285,18 +285,18 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 +; GFX10-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_lshr_b32 s0, s4, 16 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s0, s5, 16 +; GFX10-NEXT: s_lshr_b32 s1, s5, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: s_lshr_b32 s2, s6, 16 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: ds_write_b16 v1, v0 ; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 @@ -309,8 +309,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s1, s4, 16 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 @@ -333,11 +333,11 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -346,8 +346,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: 
s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -361,11 +361,11 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -375,8 +375,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 @@ -390,11 +390,11 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -403,8 +403,8 @@ 
define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -418,11 +418,11 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -432,8 +432,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 @@ -447,20 +447,20 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword 
s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -473,21 +473,21 @@ define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i ; GFX10-LABEL: store_lds_v3i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 8b94f93e44e561..a58397eccaba76 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -6,32 +6,32 @@ define amdgpu_kernel void 
@udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: udivrem_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s6, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -41,30 +41,30 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s0, 0, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s6, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -73,28 +73,28 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX10-NEXT: s_sub_i32 s0, 0, s5 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s0, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: 
v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -112,7 +112,7 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: udivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -251,7 +251,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, 
s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -384,7 +384,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -522,7 +522,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: udivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 @@ -576,7 +576,7 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 @@ -627,7 +627,7 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 @@ -685,8 +685,8 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: 
s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -783,7 +783,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -792,7 +792,6 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX9-NEXT: s_sub_i32 s4, 0, s14 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -800,7 +799,8 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_sub_i32 s4, 0, s14 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 @@ -878,9 +878,9 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -979,8 +979,8 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr 
addrspace(1 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: udivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12 @@ -1248,7 +1248,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1510,7 +1510,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 @@ -1546,9 +1546,9 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4] ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0 ; GFX10-NEXT: s_subb_u32 s3, 0, s15 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, s1, v7, v[4:5] +; GFX10-NEXT: 
v_mad_u64_u32 v[3:4], s6, s1, v7, v[4:5] ; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s3, v8, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s3, v8, v[5:6] ; GFX10-NEXT: v_mul_lo_u32 v1, v10, v2 ; GFX10-NEXT: v_mul_hi_u32 v5, v8, v2 ; GFX10-NEXT: v_mul_hi_u32 v2, v10, v2 @@ -1560,39 +1560,39 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 ; GFX10-NEXT: v_mul_hi_u32 v3, v9, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, v10, v0 -; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v11, s4, v13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v2, s4, v16, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v4, s4, v6, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v6, s4, v11, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v5 +; GFX10-NEXT: v_add_co_u32 v6, s6, v6, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v11, s6, v13, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v1, s6, v1, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v2, s6, v16, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v4, s6, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v6, s6, v11, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v1, s6, v1, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v4, s4, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 +; GFX10-NEXT: 
v_add_co_u32 v2, s6, v2, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v4, s6, v6, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v15, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v16, v5 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v7, v4 -; GFX10-NEXT: v_add_co_u32 v1, s4, v2, v1 +; GFX10-NEXT: v_add_co_u32 v1, s6, v2, v1 ; GFX10-NEXT: v_add3_u32 v3, v11, v6, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v3, vcc_lo ; GFX10-NEXT: v_add3_u32 v2, v5, v2, v0 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s0, v7, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s0, v7, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, s2, v8, 0 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s6, s2, v8, 0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2] ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4] @@ -1772,34 +1772,34 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) { ; GFX8-LABEL: udiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX8-NEXT: s_bfe_u32 s7, s6, 0x80008 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 -; GFX8-NEXT: s_and_b32 s4, s4, 0xff +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; 
GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 0xff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -1809,32 +1809,32 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 -; GFX9-NEXT: s_and_b32 s5, s0, 0xff +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 0xff ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; 
GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v2, v0, s[0:1] @@ -1843,12 +1843,12 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX10-LABEL: udiv_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s1, 0, s4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1856,17 +1856,17 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: 
v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -1884,8 +1884,8 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) { ; GFX8-LABEL: udivrem_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -1949,55 +1949,55 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80010 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: 
v_cvt_f32_ubyte3_e32 v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s5, s0, 24 +; GFX9-NEXT: s_lshr_b32 s7, s0, 24 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX9-NEXT: s_sub_i32 s2, 0, s5 +; GFX9-NEXT: s_sub_i32 s2, 0, s7 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX9-NEXT: s_and_b32 s8, s0, 0xff ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s4 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2012,7 +2012,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, s0 ; GFX10-NEXT: s_bfe_u32 s1, s0, 0x80010 @@ -2020,7 +2020,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s1 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -2029,8 +2029,8 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 ; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 @@ -2081,34 +2081,34 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, 
ptr addrspace(1) %out1, i16 %x, i16 %y) { ; GFX8-LABEL: udiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s5, s4, 16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 -; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: s_lshr_b32 s7, s6, 16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 0xffff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -2118,32 +2118,32 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, 
s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 -; GFX9-NEXT: s_and_b32 s5, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 0xffff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v2, v0, s[0:1] @@ -2152,12 +2152,12 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX10-LABEL: udiv_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: 
s_lshr_b32 s6, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s1, 0, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2165,17 +2165,17 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -2193,8 +2193,8 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: udivrem_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s2, 
s1, 0xffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2258,7 +2258,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s1, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: s_sub_i32 s1, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s4, 0, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2274,10 +2274,10 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16 -; GFX9-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2319,14 +2319,14 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s1, 0xffff ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: 
v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2387,34 +2387,34 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) { ; GFX8-LABEL: udivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX8-NEXT: s_bfe_u32 s7, s6, 0x30008 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 -; GFX8-NEXT: s_and_b32 s4, s4, 7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 7 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: 
v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -2426,32 +2426,32 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX9-LABEL: udivrem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 -; GFX9-NEXT: s_and_b32 s5, s0, 7 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2462,12 +2462,12 @@ define amdgpu_kernel 
void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX10-LABEL: udivrem_i3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s4, s0, 0x30008 +; GFX10-NEXT: s_bfe_u32 s6, s0, 0x30008 ; GFX10-NEXT: s_and_b32 s0, s0, 7 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s1, 0, s4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2475,17 +2475,17 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2505,34 +2505,34 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) { ; GFX8-LABEL: 
udivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 -; GFX8-NEXT: s_and_b32 s4, s4, 0x7ffffff +; GFX8-NEXT: s_and_b32 s7, s7, 0x7ffffff +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 0x7ffffff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2544,32 +2544,32 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i27: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 
0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s1, 0x7ffffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 -; GFX9-NEXT: s_and_b32 s5, s0, 0x7ffffff +; GFX9-NEXT: s_and_b32 s6, s1, 0x7ffffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 0x7ffffff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2580,12 +2580,12 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i27: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s4, 
s1, 0x7ffffff +; GFX10-NEXT: s_and_b32 s6, s1, 0x7ffffff ; GFX10-NEXT: s_and_b32 s0, s0, 0x7ffffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s1, 0, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2593,17 +2593,17 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index c9a9eb9d917249..83cb92210ec84a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v3i8_liveout: ; GFX906: ; 
%bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 8 ; GFX906-NEXT: v_mov_b32_e32 v5, 16 @@ -18,7 +18,7 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX906-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX906-NEXT: v_or3_b32 v4, v6, v7, v4 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v0, v2, s[6:7] @@ -28,7 +28,7 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX906-NEXT: v_or3_b32 v4, v2, v3, v0 ; GFX906-NEXT: .LBB0_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4 ; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 @@ -38,8 +38,8 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 -; GFX906-NEXT: global_store_short v1, v0, s[0:1] -; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[0:1] offset:2 +; GFX906-NEXT: global_store_short v1, v0, s[2:3] +; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[2:3] offset:2 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -61,21 +61,21 @@ bb.2: define amdgpu_kernel void 
@v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v4i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dword v1, v2, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v1, v2, s[6:7] ; GFX906-NEXT: .LBB1_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -97,30 +97,30 @@ bb.2: define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v5i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] 
; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX906-NEXT: .LBB2_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX906-NEXT: global_store_byte v4, v1, s[0:1] -; GFX906-NEXT: global_store_byte v4, v0, s[0:1] offset:1 -; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[0:1] offset:2 -; GFX906-NEXT: global_store_byte v4, v3, s[0:1] offset:3 -; GFX906-NEXT: global_store_byte v4, v2, s[0:1] offset:4 +; GFX906-NEXT: global_store_byte v4, v1, s[2:3] +; GFX906-NEXT: global_store_byte v4, v0, s[2:3] offset:1 +; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[2:3] offset:2 +; GFX906-NEXT: global_store_byte v4, v3, s[2:3] offset:3 +; GFX906-NEXT: global_store_byte v4, v2, s[2:3] offset:4 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -142,21 +142,21 @@ bb.2: define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v8i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] ; GFX906-NEXT: .LBB3_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: 
global_store_dwordx2 v0, v[1:2], s[0:1] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -178,21 +178,21 @@ bb.2: define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v16i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[6:7] ; GFX906-NEXT: .LBB4_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -214,25 +214,25 @@ bb.2: define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v9, 5, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[4:5] ; GFX906-NEXT: global_load_dwordx4 
v[5:8], v9, s[4:5] offset:16 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[6:7] ; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[6:7] offset:16 ; GFX906-NEXT: .LBB5_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 +; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[2:3] offset:16 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -254,24 +254,24 @@ bb.2: define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v256i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_mov_b32 s10, -1 +; GFX906-NEXT: s_mov_b32 s11, 0xe00000 +; GFX906-NEXT: s_add_u32 s8, s8, s3 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_addc_u32 s9, s9, 0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5] -; GFX906-NEXT: s_mov_b32 s14, -1 -; GFX906-NEXT: s_mov_b32 s15, 0xe00000 -; GFX906-NEXT: s_add_u32 s12, s12, s9 -; GFX906-NEXT: s_addc_u32 s13, s13, 0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, 
v0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5] offset:16 ; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[4:5] offset:32 @@ -288,16 +288,16 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[4:5] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[4:5] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] offset:240 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; 
GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[6:7] offset:16 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[6:7] offset:32 ; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[6:7] offset:48 @@ -314,13 +314,13 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[6:7] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] offset:240 ; GFX906-NEXT: .LBB6_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill ; GFX906-NEXT: v_mov_b32_e32 v0, v57 ; GFX906-NEXT: v_mov_b32_e32 v1, v58 ; GFX906-NEXT: v_mov_b32_e32 v2, v59 @@ -377,34 +377,34 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_mov_b32_e32 v11, v7 ; GFX906-NEXT: v_mov_b32_e32 v10, v6 ; GFX906-NEXT: v_mov_b32_e32 v9, v5 -; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: 
buffer_load_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[0:1] -; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[0:1] offset:16 -; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[0:1] offset:32 -; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[0:1] offset:48 -; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[0:1] offset:64 -; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[0:1] offset:80 -; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[0:1] offset:96 -; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:112 -; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[0:1] offset:128 -; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[0:1] offset:144 -; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[0:1] offset:160 -; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[0:1] offset:176 -; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[0:1] offset:192 -; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[0:1] offset:208 -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:224 -; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[2:3] +; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[2:3] offset:16 +; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[2:3] offset:32 +; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[2:3] offset:48 +; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[2:3] offset:64 +; 
GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[2:3] offset:80 +; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[2:3] offset:96 +; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[2:3] offset:112 +; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[2:3] offset:128 +; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[2:3] offset:144 +; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[2:3] offset:160 +; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[2:3] offset:176 +; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[2:3] offset:192 +; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[2:3] offset:208 +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224 +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:240 +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:240 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -427,26 +427,26 @@ bb.2: define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: repeat_successor: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX906-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX906-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_cmp_lt_i32 s0, 3 +; GFX906-NEXT: s_cmp_lt_i32 s2, 3 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX906-NEXT: ; %bb.1: ; %LeafBlock -; GFX906-NEXT: s_cmp_ge_i32 s0, 1 +; GFX906-NEXT: s_cmp_ge_i32 s2, 1 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 ; GFX906-NEXT: ; %bb.2: ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX906-NEXT: global_load_dword v0, v0, s[4:5] ; GFX906-NEXT: s_branch .LBB7_5 ; GFX906-NEXT: .LBB7_3: ; %LeafBlock5 -; GFX906-NEXT: s_cmp_eq_u32 s0, 3 +; GFX906-NEXT: s_cmp_eq_u32 s2, 3 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 ; GFX906-NEXT: ; %bb.4: ; %sw.bb5 ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX906-NEXT: global_load_dword v0, v0, s[6:7] ; GFX906-NEXT: .LBB7_5: ; %return.sink.split -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_store_dword v1, v0, s[0:1] @@ -479,7 +479,7 @@ return: define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_chain: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) @@ -533,7 +533,7 @@ bb.3: define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_multi_block: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) @@ -584,14 +584,14 @@ bb.3: define amdgpu_kernel void @v32i8_loop_carried(ptr 
addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_loop_carried: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 8 ; GFX906-NEXT: v_mov_b32_e32 v2, 0xff ; GFX906-NEXT: v_cmp_le_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v1, v1, s[0:1] -; GFX906-NEXT: s_mov_b64 s[0:1], 0 +; GFX906-NEXT: global_load_dword v1, v1, s[2:3] +; GFX906-NEXT: s_mov_b64 s[2:3], 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX906-NEXT: v_and_or_b32 v0, v1, v2, v0 @@ -602,13 +602,13 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp ; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX906-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; GFX906-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] ; GFX906-NEXT: v_or3_b32 v1, v0, v3, v1 -; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_cbranch_execnz .LBB10_1 ; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll index ef2e57eafbf137..037210a496d6d6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 { ; GFX8-LABEL: constant_load_i8_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -18,7 +18,7 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a ; ; GFX9-LABEL: constant_load_i8_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -29,7 +29,7 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a ; ; GFX10-LABEL: constant_load_i8_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -45,7 +45,7 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 { ; GFX8-LABEL: constant_load_i16_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr ; ; GFX9-LABEL: constant_load_i16_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
s_load_dword s2, s[2:3], 0x0 @@ -68,7 +68,7 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr ; ; GFX10-LABEL: constant_load_i16_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: sextload_i8_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -97,7 +97,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: sextload_i8_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -109,7 +109,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: sextload_i8_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -127,7 +127,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: sextload_i16_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -140,7 +140,7 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: sextload_i16_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: sextload_i16_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -170,7 +170,7 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: zextload_i8_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -183,7 +183,7 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: zextload_i8_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -195,7 +195,7 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: zextload_i8_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -213,7 +213,7 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: zextload_i16_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -226,7 +226,7 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: zextload_i16_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -238,7 +238,7 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: zextload_i16_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -256,7 +256,7 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_load_i8_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -269,7 +269,7 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: constant_load_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -279,7 +279,7 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: constant_load_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -294,7 +294,7 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_load_i16_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -307,7 +307,7 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: constant_load_i16_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -317,7 +317,7 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a ; ; GFX10-LABEL: constant_load_i16_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -332,7 +332,7 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_sextload_i8_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -351,7 +351,7 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: constant_sextload_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3] @@ -362,7 +362,7 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: constant_sextload_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3] @@ -379,7 +379,7 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_zextload_i8_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -398,7 +398,7 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: constant_zextload_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -409,7 +409,7 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: constant_zextload_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index e9797fa1fc309f..422e2747094ce2 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: s_add_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -22,7 +22,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX8-LABEL: s_add_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: s_add_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -47,7 +47,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX10-LABEL: s_add_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -59,7 +59,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX11-LABEL: s_add_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], 
s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -73,7 +73,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: s_add_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -95,7 +95,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: s_add_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -110,7 +110,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: s_add_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -125,7 +125,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_add_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: s_add_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -153,7 +153,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: s_add_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -169,7 +169,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: s_add_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -193,7 +193,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: s_add_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 @@ -212,7 +212,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: s_add_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -231,7 +231,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_add_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -249,7 +249,7 @@ define 
amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: s_add_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -267,7 +267,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: s_add_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -286,7 +286,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: s_add_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -313,36 +313,36 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) { ; GFX6-LABEL: s_add_v8i32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x11 -; GFX6-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s23, 0xf000 -; GFX6-NEXT: s_mov_b32 s22, -1 +; GFX6-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s7, s15 -; GFX6-NEXT: s_add_i32 s1, s6, s14 -; GFX6-NEXT: s_add_i32 s2, s5, s13 -; GFX6-NEXT: s_add_i32 s3, s4, s12 -; GFX6-NEXT: s_add_i32 s4, s11, s19 -; GFX6-NEXT: s_add_i32 s5, s10, s18 -; GFX6-NEXT: s_add_i32 s6, s9, s17 -; GFX6-NEXT: s_add_i32 s7, s8, s16 -; GFX6-NEXT: 
v_mov_b32_e32 v0, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 +; GFX6-NEXT: s_add_i32 s11, s11, s19 +; GFX6-NEXT: s_add_i32 s10, s10, s18 +; GFX6-NEXT: s_add_i32 s9, s9, s17 +; GFX6-NEXT: s_add_i32 s8, s8, s16 +; GFX6-NEXT: s_add_i32 s7, s7, s15 +; GFX6-NEXT: s_add_i32 s6, s6, s14 +; GFX6-NEXT: s_add_i32 s5, s5, s13 +; GFX6-NEXT: s_add_i32 s4, s4, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s3 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_add_v8i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s7, s7, s15 ; GFX8-NEXT: s_add_i32 s6, s6, s14 @@ -372,9 +372,9 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; ; GFX9-LABEL: s_add_v8i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s2, s7, s15 ; GFX9-NEXT: s_add_i32 
s3, s6, s14 @@ -399,10 +399,9 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; ; GFX10-LABEL: s_add_v8i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s2, s7, s15 ; GFX10-NEXT: s_add_i32 s3, s6, s14 @@ -427,8 +426,8 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; GFX11-LABEL: s_add_v8i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s2, s7, s15 ; GFX11-NEXT: s_add_i32 s3, s6, s14 @@ -453,8 +452,8 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; GFX12-LABEL: s_add_v8i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s7, s15 ; GFX12-NEXT: s_add_co_i32 s3, s6, s14 @@ -484,58 +483,58 @@ entry: define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <16 x i32> %b) { ; GFX6-LABEL: s_add_v16i32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; GFX6-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x29 -; GFX6-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s23, 0xf000 -; GFX6-NEXT: s_mov_b32 s22, -1 +; GFX6-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; GFX6-NEXT: s_load_dwordx16 
s[36:51], s[0:1], 0x29 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s7, s39 -; GFX6-NEXT: s_add_i32 s1, s6, s38 -; GFX6-NEXT: s_add_i32 s2, s5, s37 -; GFX6-NEXT: s_add_i32 s3, s4, s36 -; GFX6-NEXT: s_add_i32 s4, s11, s43 -; GFX6-NEXT: s_add_i32 s5, s10, s42 -; GFX6-NEXT: s_add_i32 s6, s9, s41 -; GFX6-NEXT: s_add_i32 s7, s8, s40 -; GFX6-NEXT: s_add_i32 s8, s15, s47 -; GFX6-NEXT: s_add_i32 s9, s14, s46 -; GFX6-NEXT: s_add_i32 s10, s13, s45 -; GFX6-NEXT: s_add_i32 s11, s12, s44 -; GFX6-NEXT: s_add_i32 s12, s19, s51 -; GFX6-NEXT: s_add_i32 s13, s18, s50 -; GFX6-NEXT: s_add_i32 s14, s17, s49 -; GFX6-NEXT: s_add_i32 s15, s16, s48 -; GFX6-NEXT: v_mov_b32_e32 v0, s15 -; GFX6-NEXT: v_mov_b32_e32 v1, s14 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 -; GFX6-NEXT: v_mov_b32_e32 v3, s12 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48 +; GFX6-NEXT: s_add_i32 s19, s19, s51 +; GFX6-NEXT: s_add_i32 s18, s18, s50 +; GFX6-NEXT: s_add_i32 s17, s17, s49 +; GFX6-NEXT: s_add_i32 s16, s16, s48 +; GFX6-NEXT: s_add_i32 s15, s15, s47 +; GFX6-NEXT: s_add_i32 s14, s14, s46 +; GFX6-NEXT: s_add_i32 s13, s13, s45 +; GFX6-NEXT: s_add_i32 s12, s12, s44 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: s_add_i32 s11, s11, s43 +; GFX6-NEXT: s_add_i32 s10, s10, s42 +; GFX6-NEXT: s_add_i32 s9, s9, s41 +; GFX6-NEXT: s_add_i32 s8, s8, s40 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s11 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; 
GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: s_add_i32 s7, s7, s39 +; GFX6-NEXT: s_add_i32 s6, s6, s38 +; GFX6-NEXT: s_add_i32 s5, s5, s37 +; GFX6-NEXT: s_add_i32 s4, s4, s36 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s3 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_add_v16i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX8-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GFX8-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s7, s7, s39 ; GFX8-NEXT: s_add_i32 s6, s6, s38 @@ -591,10 +590,10 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; ; GFX9-LABEL: s_add_v16i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; 
GFX9-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s2, s7, s39 ; GFX9-NEXT: s_add_i32 s3, s6, s38 @@ -638,11 +637,11 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; ; GFX10-LABEL: s_add_v16i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s2, s7, s39 ; GFX10-NEXT: s_add_i32 s3, s6, s38 @@ -685,9 +684,9 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX11-LABEL: s_add_v16i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 -; GFX11-NEXT: s_load_b512 s[36:51], s[2:3], 0xa4 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 +; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s2, s7, s39 ; GFX11-NEXT: s_add_i32 s3, s6, s38 @@ -726,9 +725,9 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX12-LABEL: s_add_v16i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 -; GFX12-NEXT: s_load_b512 s[36:51], s[2:3], 0xa4 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 +; GFX12-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 
0x0 ; GFX12-NEXT: s_add_co_i32 s2, s7, s39 ; GFX12-NEXT: s_add_co_i32 s3, s6, s38 @@ -772,7 +771,7 @@ entry: define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_add_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -793,7 +792,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX8-LABEL: v_add_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -813,7 +812,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_add_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -827,7 +826,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX10-LABEL: v_add_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -841,11 +840,9 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX11-LABEL: v_add_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -859,11 +856,9 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: v_add_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -887,7 +882,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_add_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -906,7 +901,7 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: v_add_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -922,7 +917,7 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_add_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -934,7 +929,7 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_add_imm_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -946,10 +941,8 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_add_imm_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -961,10 +954,8 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: v_add_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -985,8 +976,8 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: add64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1001,8 +992,8 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 
%b) { ; ; GFX8-LABEL: add64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s0, s6, s0 @@ -1015,12 +1006,12 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX9-LABEL: add64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s6, s0 -; GFX9-NEXT: s_addc_u32 s1, s7, s1 +; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -1029,12 +1020,12 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX10-LABEL: add64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s6, s0 -; GFX10-NEXT: s_addc_u32 s1, s7, s1 +; GFX10-NEXT: s_add_u32 s0, s6, s2 +; GFX10-NEXT: s_addc_u32 s1, s7, s3 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -1043,8 +1034,8 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: add64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 
s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s6, s0 ; GFX11-NEXT: s_addc_u32 s1, s7, s1 @@ -1058,8 +1049,8 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-LABEL: add64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -1081,8 +1072,8 @@ entry: define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr addrspace(1) %in) { ; GFX6-LABEL: add64_sgpr_vgpr: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1099,8 +1090,8 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; ; GFX8-LABEL: add64_sgpr_vgpr: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1115,11 +1106,11 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; ; GFX9-LABEL: add64_sgpr_vgpr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s6, s0 ; GFX9-NEXT: s_addc_u32 s1, s7, s1 @@ -1131,11 +1122,11 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX10-LABEL: add64_sgpr_vgpr: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s6, s0 ; GFX10-NEXT: s_addc_u32 s1, s7, s1 @@ -1147,8 +1138,8 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX11-LABEL: add64_sgpr_vgpr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1164,8 +1155,8 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX12-LABEL: add64_sgpr_vgpr: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1187,7 +1178,7 @@ entry: define amdgpu_kernel void 
@add64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ; GFX6-LABEL: add64_in_branch: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -1214,7 +1205,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add64_in_branch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -1240,7 +1231,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -1265,7 +1256,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: add64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB9_4 @@ -1288,7 +1279,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: add64_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 @@ -1312,7 +1303,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: add64_in_branch: ; GFX12: ; %bb.0: ; %entry 
-; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB9_4 diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index b751be51a97393..6f67ce4de9ce54 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -33,13 +33,13 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_add_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 @@ -49,13 +49,13 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: v_test_add_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; 
GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2 @@ -65,12 +65,10 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: v_test_add_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -95,8 +93,8 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 { ; VI-LABEL: s_test_add_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -116,37 +114,37 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: s_test_add_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; 
GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_pk_add_u16 v1, s3, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 +; GFX10-NEXT: v_pk_add_u16 v1, s0, s1 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_add_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 @@ -167,7 +165,7 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 { ; VI-LABEL: s_test_add_self_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; 
VI-NEXT: v_mov_b32_e32 v0, s0 @@ -186,7 +184,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: s_test_add_self_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -197,7 +195,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX10-LABEL: s_test_add_self_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -208,7 +206,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: s_test_add_self_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -228,7 +226,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; VI-LABEL: s_test_add_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: s_lshr_b32 s5, s3, 16 @@ -245,7 +243,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_test_add_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -255,7 +253,7 @@ define amdgpu_kernel void 
@s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX10-LABEL: s_test_add_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 @@ -264,7 +262,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX11-LABEL: s_test_add_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v1, s2, s3 @@ -281,7 +279,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x1c8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -300,7 +298,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_add_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -313,7 +311,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_add_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -325,10 +323,8 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; 
GFX11-LABEL: v_test_add_v2i16_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -350,7 +346,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -369,7 +365,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -382,7 +378,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_add_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -394,10 +390,8 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_add_v2i16_neg_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 
0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -418,7 +412,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -437,7 +431,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_test_add_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -449,7 +443,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_add_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -461,10 +455,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_test_add_v2i16_inline_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -485,7 +477,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -503,7 +495,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -515,7 +507,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -527,10 +519,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; 
GFX11-NEXT: s_waitcnt vmcnt(0) @@ -552,7 +542,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f80 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -570,7 +560,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -582,7 +572,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -594,10 +584,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -619,8 +607,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou define 
amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_zext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -642,14 +630,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -660,13 +648,13 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; 
GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -678,12 +666,10 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -713,8 +699,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_zext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -738,14 +724,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v2, v3 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16 @@ -757,13 +743,13 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -776,10 +762,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -811,8 +795,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_sext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 
2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -836,14 +820,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 @@ -854,13 +838,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -872,12 +856,10 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -907,8 +889,8 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_sext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -933,13 +915,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 @@ -953,14 +935,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -974,12 +956,10 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 1315d576a83eb6..526d5c946ec7f6 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -1,7 +1,5 @@ -; RUN: opt -passes=amdgpu-attributor -mcpu=kaveri -mattr=-promote-alloca < %s | llc | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 -mattr=-promote-alloca < %s | llc | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s - -target triple = "amdgcn-amd-amdhsa" +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast: diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index fb96b9ff2952e8..60f61a67ccf0be 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -245,7 +245,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 { ; GFX908-LABEL: no_agpr_no_reserve: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 @@ -303,8 +303,7 @@ define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: no_agpr_no_reserve: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 @@ -515,14 +514,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb ; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX908-NEXT: s_load_dword s9, s[6:7], 0x18 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX908-NEXT: s_load_dword s9, s[4:5], 0x18 ; GFX908-NEXT: s_mov_b32 s8, 0 -; GFX908-NEXT: s_mov_b32 s7, s8 +; GFX908-NEXT: s_mov_b32 s5, s8 ; GFX908-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX908-NEXT: s_sub_i32 s6, 0, s3 +; GFX908-NEXT: s_sub_i32 s4, 0, s3 ; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s9 ; GFX908-NEXT: v_mov_b32_e32 v19, 0 ; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0 @@ -531,32 +530,32 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: s_mul_i32 s6, s6, s10 -; GFX908-NEXT: s_mul_hi_u32 s6, s10, s6 -; GFX908-NEXT: s_add_i32 s10, s10, s6 -; GFX908-NEXT: s_mul_hi_u32 s6, s2, s10 -; GFX908-NEXT: s_mul_i32 s10, s6, s3 +; GFX908-NEXT: s_mul_i32 s4, s4, s10 +; GFX908-NEXT: s_mul_hi_u32 s4, s10, s4 +; GFX908-NEXT: s_add_i32 s10, s10, s4 +; GFX908-NEXT: s_mul_hi_u32 s4, s2, s10 +; GFX908-NEXT: s_mul_i32 s10, s4, s3 ; GFX908-NEXT: s_sub_i32 s2, s2, s10 -; GFX908-NEXT: s_add_i32 s11, s6, 1 +; GFX908-NEXT: s_add_i32 s11, s4, 1 ; GFX908-NEXT: s_sub_i32 s10, s2, s3 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s6, s11, s6 +; GFX908-NEXT: s_cselect_b32 s4, s11, s4 ; GFX908-NEXT: s_cselect_b32 s2, s10, s2 -; GFX908-NEXT: s_add_i32 s10, s6, 1 +; GFX908-NEXT: s_add_i32 s10, s4, 1 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s6, s10, s6 +; GFX908-NEXT: s_cselect_b32 s4, s10, s4 ; GFX908-NEXT: s_lshr_b32 s9, s9, 16 -; GFX908-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 +; GFX908-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 ; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s9 ; GFX908-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX908-NEXT: s_lshl_b64 s[10:11], s[4:5], 5 +; GFX908-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 ; GFX908-NEXT: s_or_b32 s10, s10, 28 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s7, v16 -; GFX908-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX908-NEXT: s_mul_i32 s1, s1, s7 -; GFX908-NEXT: s_mul_hi_u32 s9, s0, s7 -; GFX908-NEXT: s_mul_i32 s0, s0, s7 +; GFX908-NEXT: v_readfirstlane_b32 s5, v16 +; 
GFX908-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX908-NEXT: s_mul_i32 s1, s1, s5 +; GFX908-NEXT: s_mul_hi_u32 s9, s0, s5 +; GFX908-NEXT: s_mul_i32 s0, s0, s5 ; GFX908-NEXT: s_add_i32 s1, s9, s1 ; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 @@ -572,7 +571,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 ; GFX908-NEXT: s_mov_b32 s9, s8 ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] ; GFX908-NEXT: v_mov_b32_e32 v4, s8 @@ -582,20 +581,20 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_mov_b32_e32 v5, s9 ; GFX908-NEXT: v_mov_b32_e32 v9, s9 ; GFX908-NEXT: v_mov_b32_e32 v7, s9 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s7, v2 +; GFX908-NEXT: v_readfirstlane_b32 s5, v2 ; GFX908-NEXT: v_readfirstlane_b32 s9, v3 -; GFX908-NEXT: s_add_u32 s7, s7, 1 +; GFX908-NEXT: s_add_u32 s5, s5, 1 ; GFX908-NEXT: s_addc_u32 s9, s9, 0 -; GFX908-NEXT: s_mul_hi_u32 s20, s2, s7 +; GFX908-NEXT: s_mul_hi_u32 s20, s2, s5 ; GFX908-NEXT: s_mul_i32 s9, s2, s9 -; GFX908-NEXT: s_mul_i32 s21, s3, s7 +; GFX908-NEXT: s_mul_i32 s21, s3, s5 ; GFX908-NEXT: s_add_i32 s9, s20, s9 -; GFX908-NEXT: s_mul_i32 s7, s2, s7 +; GFX908-NEXT: s_mul_i32 s5, s2, s5 ; GFX908-NEXT: s_add_i32 s9, s9, s21 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 @@ -611,7 +610,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop 
BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: s_add_u32 s20, s18, s7 +; GFX908-NEXT: s_add_u32 s20, s18, s5 ; GFX908-NEXT: s_addc_u32 s21, s19, s9 ; GFX908-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -671,8 +670,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s4, s4, s6 -; GFX908-NEXT: s_addc_u32 s5, s5, 0 +; GFX908-NEXT: s_add_u32 s6, s6, s4 +; GFX908-NEXT: s_addc_u32 s7, s7, 0 ; GFX908-NEXT: s_add_u32 s10, s10, s12 ; GFX908-NEXT: s_addc_u32 s11, s11, s13 ; GFX908-NEXT: s_mov_b64 s[0:1], 0 @@ -683,14 +682,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-LABEL: introduced_copy_to_sgpr: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX90A-NEXT: s_load_dword s9, s[6:7], 0x18 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX90A-NEXT: s_load_dword s9, s[4:5], 0x18 ; GFX90A-NEXT: s_mov_b32 s8, 0 -; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: s_mov_b32 s5, s8 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX90A-NEXT: s_sub_i32 s6, 0, s3 +; GFX90A-NEXT: s_sub_i32 s4, 0, s3 ; GFX90A-NEXT: v_mov_b32_e32 v19, 0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0 @@ -698,32 +697,32 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s9 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v3 -; GFX90A-NEXT: s_mul_i32 s6, s6, s10 -; GFX90A-NEXT: s_mul_hi_u32 s6, s10, s6 -; GFX90A-NEXT: s_add_i32 s10, s10, s6 -; 
GFX90A-NEXT: s_mul_hi_u32 s6, s2, s10 -; GFX90A-NEXT: s_mul_i32 s10, s6, s3 +; GFX90A-NEXT: s_mul_i32 s4, s4, s10 +; GFX90A-NEXT: s_mul_hi_u32 s4, s10, s4 +; GFX90A-NEXT: s_add_i32 s10, s10, s4 +; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s10 +; GFX90A-NEXT: s_mul_i32 s10, s4, s3 ; GFX90A-NEXT: s_sub_i32 s2, s2, s10 -; GFX90A-NEXT: s_add_i32 s11, s6, 1 +; GFX90A-NEXT: s_add_i32 s11, s4, 1 ; GFX90A-NEXT: s_sub_i32 s10, s2, s3 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s6, s11, s6 +; GFX90A-NEXT: s_cselect_b32 s4, s11, s4 ; GFX90A-NEXT: s_cselect_b32 s2, s10, s2 -; GFX90A-NEXT: s_add_i32 s10, s6, 1 +; GFX90A-NEXT: s_add_i32 s10, s4, 1 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s6, s10, s6 +; GFX90A-NEXT: s_cselect_b32 s4, s10, s4 ; GFX90A-NEXT: s_lshr_b32 s9, s9, 16 -; GFX90A-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 +; GFX90A-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s9 ; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX90A-NEXT: s_lshl_b64 s[10:11], s[4:5], 5 +; GFX90A-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 ; GFX90A-NEXT: s_or_b32 s10, s10, 28 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s7, v18 -; GFX90A-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX90A-NEXT: s_mul_i32 s1, s1, s7 -; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s7 -; GFX90A-NEXT: s_mul_i32 s0, s0, s7 +; GFX90A-NEXT: v_readfirstlane_b32 s5, v18 +; GFX90A-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX90A-NEXT: s_mul_i32 s1, s1, s5 +; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s5 +; GFX90A-NEXT: s_mul_i32 s0, s0, s5 ; GFX90A-NEXT: s_add_i32 s1, s9, s1 ; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 @@ -739,26 +738,26 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], 
s[6:7], -1 ; GFX90A-NEXT: s_mov_b32 s9, s8 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0 ; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s7, v4 +; GFX90A-NEXT: v_readfirstlane_b32 s5, v4 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v5 -; GFX90A-NEXT: s_add_u32 s7, s7, 1 +; GFX90A-NEXT: s_add_u32 s5, s5, 1 ; GFX90A-NEXT: s_addc_u32 s9, s9, 0 -; GFX90A-NEXT: s_mul_hi_u32 s20, s2, s7 +; GFX90A-NEXT: s_mul_hi_u32 s20, s2, s5 ; GFX90A-NEXT: s_mul_i32 s9, s2, s9 -; GFX90A-NEXT: s_mul_i32 s21, s3, s7 +; GFX90A-NEXT: s_mul_i32 s21, s3, s5 ; GFX90A-NEXT: s_add_i32 s9, s20, s9 -; GFX90A-NEXT: s_mul_i32 s7, s2, s7 +; GFX90A-NEXT: s_mul_i32 s5, s2, s5 ; GFX90A-NEXT: s_add_i32 s9, s9, s21 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 @@ -774,7 +773,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: s_add_u32 s20, s18, s7 +; GFX90A-NEXT: s_add_u32 s20, s18, s5 ; GFX90A-NEXT: s_addc_u32 s21, s19, s9 ; GFX90A-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -827,8 +826,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s4, s4, s6 -; GFX90A-NEXT: s_addc_u32 s5, s5, 0 +; GFX90A-NEXT: s_add_u32 s6, s6, 
s4 +; GFX90A-NEXT: s_addc_u32 s7, s7, 0 ; GFX90A-NEXT: s_add_u32 s10, s10, s12 ; GFX90A-NEXT: s_addc_u32 s11, s11, s13 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll index 8d87b53efb4e73..bd5dc6e2070986 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll @@ -175,4 +175,4 @@ bb: ret void } -attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll index 0a461f9ee6c968..0c5e1ec0d5b6f1 100644 --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapture readonly, ptr addrspace(1) noalias nocapture readonly) { ; GCN-LABEL: readfirstlane_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll index def6df9adf5977..330cf48803680d 100644 --- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll +++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll @@ -41,16 +41,16 @@ define void @test1() { define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; GFX9-LABEL: test2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lt_i32 s0, 1 +; GFX9-NEXT: s_cmp_lt_i32 s2, 1 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %else -; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB2_2: ; %then @@ -58,16 +58,16 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; ; GFX10-LABEL: test2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lt_i32 s0, 1 +; GFX10-NEXT: s_cmp_lt_i32 s2, 1 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; GFX10-NEXT: .LBB2_2: ; %then @@ -75,15 +75,15 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; ; GFX11-LABEL: test2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lt_i32 s0, 1 +; GFX11-NEXT: s_cmp_lt_i32 s2, 1 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %else -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; 
GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB2_2: ; %then diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index bf72cccd912cee..cb59121d697083 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -392,7 +392,7 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) { ; ; GCN-LABEL: select_add_lhs_const_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x83 ; GCN-NEXT: v_mov_b32_e32 v1, 0x80 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 7cf18171a6cd74..8144fb7a3b6461 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -39,7 +39,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: udiv_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -72,7 +72,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -137,7 +137,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: urem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -167,7 +167,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -241,7 +241,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: sdiv_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -280,7 +280,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_abs_i32 s4, s3 @@ -359,7 +359,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: srem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -394,7 +394,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_abs_i32 s3, s3 @@ -452,15 +452,15 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: udiv_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 
; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s1, s0, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_lshr_b32 s3, s2, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 @@ -468,20 +468,19 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 @@ -489,7 +488,6 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm 
%r = udiv i16 %x, %y @@ -523,37 +521,36 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: urem_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s5, s4, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_and_b32 s0, s4, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_lshr_b32 s2, s4, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_and_b32 s3, s4, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_and_b32 s0, s4, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: 
v_trunc_f32_e32 v2, v2 @@ -562,8 +559,8 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -600,8 +597,8 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: sdiv_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -626,27 +623,27 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s4, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_ashr_i32 s0, s4, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i16 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: 
v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 -; GFX9-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX9-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i16 %x, %y store i16 %r, ptr addrspace(1) %out @@ -683,8 +680,8 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: srem_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s5, s4, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 @@ -711,8 +708,7 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX9-LABEL: srem_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s4, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 @@ -722,6 +718,7 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -733,6 +730,7 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short 
v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i16 %x, %y @@ -764,8 +762,8 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: udiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -783,13 +781,13 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 @@ -829,8 +827,8 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: urem_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 @@ -851,13 +849,13 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: urem_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 +; GFX9-NEXT: 
v_cvt_f32_ubyte1_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 -; GFX9-NEXT: s_lshr_b32 s2, s4, 8 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 +; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 @@ -865,8 +863,9 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i8 %x, %y @@ -902,8 +901,8 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: sdiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -928,27 +927,27 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i8 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i8 s1, s4 +; 
GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 -; GFX9-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i8 %x, %y store i8 %r, ptr addrspace(1) %out @@ -985,8 +984,8 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: srem_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s2, s4, 0x80008 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 @@ -1014,30 +1013,30 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: srem_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i8 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: 
s_sext_i32_i8 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_lshr_b32 s5, s4, 8 -; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: s_or_b32 s6, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = srem i8 %x, %y store i8 %r, ptr addrspace(1) %out @@ -1179,13 +1178,13 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: udiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 ; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: s_sub_i32 s0, 0, s12 +; GFX6-NEXT: s_sub_i32 s2, 0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s14 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -1195,28 +1194,28 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; 
GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s12 -; GFX6-NEXT: s_sub_i32 s0, s8, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s12 -; GFX6-NEXT: s_cmp_ge_u32 s0, s12 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s12 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s12 +; GFX6-NEXT: s_cmp_ge_u32 s2, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s12 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s2, s12 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_sub_i32 s4, 0, s13 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 @@ -1277,9 +1276,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX9-LABEL: udiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -1499,36 +1498,34 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: 
urem_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: s_sub_i32 s2, 0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s8 -; GFX6-NEXT: s_sub_i32 s0, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v0, s1, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s8 +; GFX6-NEXT: s_sub_i32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s4, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 @@ -1536,58 +1533,60 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: 
v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s9 -; GFX6-NEXT: s_sub_i32 s1, s5, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, 0, s10 -; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s9 +; GFX6-NEXT: s_sub_i32 s2, s5, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s5, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s10 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s10 -; GFX6-NEXT: s_sub_i32 s4, s6, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, 0, s11 -; GFX6-NEXT: v_mul_lo_u32 v0, s5, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s10 +; GFX6-NEXT: s_sub_i32 s2, s6, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s6, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s11 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; 
GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v2, s7, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v2 -; GFX6-NEXT: s_mul_i32 s0, s0, s11 -; GFX6-NEXT: s_sub_i32 s0, s7, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: s_mul_i32 s4, s4, s11 +; GFX6-NEXT: s_sub_i32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -1843,34 +1842,34 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s19, 
0xf000 ; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s0, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_abs_i32 s2, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: s_xor_b32 s4, s8, s12 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX6-NEXT: s_abs_i32 s1, s8 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_abs_i32 s3, s8 ; GFX6-NEXT: s_ashr_i32 s8, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s1, s4 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_mul_i32 s4, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_cselect_b32 s3, s4, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_abs_i32 s4, s13 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GFX6-NEXT: s_sub_i32 s5, 0, s4 @@ -1878,7 +1877,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_xor_b32 s6, s9, s13 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 @@ -1965,16 +1964,17 @@ define amdgpu_kernel void 
@sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_xor_b32 s1, s4, s8 -; GFX9-NEXT: s_sub_i32 s8, 0, s0 +; GFX9-NEXT: s_abs_i32 s2, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_xor_b32 s3, s4, s8 +; GFX9-NEXT: s_sub_i32 s8, 0, s2 ; GFX9-NEXT: s_abs_i32 s4, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s1, s1, 31 +; GFX9-NEXT: s_ashr_i32 s3, s3, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s12, v0 @@ -1982,82 +1982,81 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 ; GFX9-NEXT: s_add_i32 s12, s12, s8 ; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 -; GFX9-NEXT: s_mul_i32 s12, s8, s0 +; GFX9-NEXT: s_mul_i32 s12, s8, s2 ; GFX9-NEXT: s_sub_i32 s4, s4, s12 ; GFX9-NEXT: s_add_i32 s13, s8, 1 -; GFX9-NEXT: s_sub_i32 s12, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_sub_i32 s12, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 ; GFX9-NEXT: s_cselect_b32 s8, s13, s8 ; GFX9-NEXT: s_cselect_b32 s4, s12, s4 ; GFX9-NEXT: s_add_i32 s12, s8, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s0, s12, s8 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s12, s8 ; GFX9-NEXT: s_abs_i32 s4, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: s_xor_b32 s8, s5, s9 ; GFX9-NEXT: s_sub_i32 s9, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s12, s0, s1 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: s_ashr_i32 s8, s8, 
31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s0 -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s9 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s1, s0, s4 -; GFX9-NEXT: s_sub_i32 s1, s5, s1 -; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_sub_i32 s5, s1, s4 -; GFX9-NEXT: s_cmp_ge_u32 s1, s4 -; GFX9-NEXT: s_cselect_b32 s0, s9, s0 -; GFX9-NEXT: s_cselect_b32 s1, s5, s1 -; GFX9-NEXT: s_add_i32 s5, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s1, s4 -; GFX9-NEXT: s_cselect_b32 s0, s5, s0 -; GFX9-NEXT: s_abs_i32 s1, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s6, s10 -; GFX9-NEXT: s_abs_i32 s5, s6 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s3 +; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9 +; GFX9-NEXT: s_add_i32 s3, s3, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s9, s3, s4 +; GFX9-NEXT: s_sub_i32 s5, s5, s9 +; GFX9-NEXT: s_add_i32 s12, s3, 1 +; GFX9-NEXT: s_sub_i32 s9, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s12, s3 +; GFX9-NEXT: s_cselect_b32 s5, s9, s5 +; GFX9-NEXT: s_add_i32 s9, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s9, s3 +; GFX9-NEXT: s_abs_i32 s4, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s8 +; GFX9-NEXT: s_sub_i32 s9, 0, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, 0, s1 -; GFX9-NEXT: s_sub_i32 s8, s0, s8 -; GFX9-NEXT: s_ashr_i32 s4, s4, 31 +; GFX9-NEXT: s_xor_b32 s5, s6, s10 +; GFX9-NEXT: s_abs_i32 s6, s6 +; GFX9-NEXT: s_ashr_i32 s5, s5, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s0 -; GFX9-NEXT: s_mul_hi_u32 
s6, s0, s6 -; GFX9-NEXT: s_add_i32 s0, s0, s6 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s6, s0, s1 -; GFX9-NEXT: s_sub_i32 s5, s5, s6 -; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_sub_i32 s6, s5, s1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s1 -; GFX9-NEXT: s_cselect_b32 s0, s9, s0 -; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s1 -; GFX9-NEXT: s_cselect_b32 s5, s6, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s8 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9 +; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8 +; GFX9-NEXT: s_mul_i32 s9, s8, s4 +; GFX9-NEXT: s_sub_i32 s6, s6, s9 +; GFX9-NEXT: s_add_i32 s10, s8, 1 +; GFX9-NEXT: s_sub_i32 s9, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s8, s10, s8 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 +; GFX9-NEXT: s_add_i32 s9, s8, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s4, s9, s8 ; GFX9-NEXT: s_abs_i32 s6, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_xor_b32 s2, s7, s11 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_abs_i32 s3, s7 ; GFX9-NEXT: s_sub_i32 s7, 0, s6 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s4, s4, s5 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_ashr_i32 s2, s2, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: s_mul_i32 s7, s7, s5 ; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 @@ -2077,7 +2076,6 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, 
v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i32> %x, %y @@ -2244,34 +2242,35 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: srem_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s0, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_abs_i32 s2, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX6-NEXT: s_abs_i32 s1, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_abs_i32 s3, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: s_mul_i32 s8, s8, s0 -; GFX6-NEXT: s_sub_i32 s1, s1, s8 -; GFX6-NEXT: s_sub_i32 s8, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s1, s8, s1 -; GFX6-NEXT: s_sub_i32 s8, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s0, s8, s1 -; GFX6-NEXT: s_abs_i32 s1, s9 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX6-NEXT: s_sub_i32 s8, 0, s1 -; GFX6-NEXT: s_xor_b32 s0, s0, s4 -; GFX6-NEXT: s_sub_i32 s0, s0, s4 +; GFX6-NEXT: s_mul_i32 s8, s8, s2 +; GFX6-NEXT: s_sub_i32 s3, s3, s8 +; GFX6-NEXT: s_sub_i32 s8, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s3, s8, s3 +; GFX6-NEXT: s_sub_i32 s8, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s2, s8, s3 +; GFX6-NEXT: s_abs_i32 s3, s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s8, 0, s3 +; GFX6-NEXT: s_xor_b32 s2, s2, s4 +; GFX6-NEXT: 
s_sub_i32 s4, s2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2281,22 +2280,21 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s8, s4 -; GFX6-NEXT: s_sub_i32 s8, s4, s1 -; GFX6-NEXT: s_cmp_ge_u32 s4, s1 -; GFX6-NEXT: s_cselect_b32 s4, s8, s4 -; GFX6-NEXT: s_sub_i32 s8, s4, s1 -; GFX6-NEXT: s_cmp_ge_u32 s4, s1 -; GFX6-NEXT: s_cselect_b32 s1, s8, s4 -; GFX6-NEXT: s_abs_i32 s4, s10 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX6-NEXT: s_sub_i32 s8, 0, s4 -; GFX6-NEXT: s_xor_b32 s1, s1, s5 -; GFX6-NEXT: s_sub_i32 s1, s1, s5 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_abs_i32 s3, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s8, 0, s3 +; GFX6-NEXT: s_xor_b32 s2, s2, s5 +; GFX6-NEXT: s_sub_i32 s5, s2, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s8, v0 @@ -2305,59 +2303,59 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s5, v0 -; GFX6-NEXT: s_mul_i32 s5, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 -; GFX6-NEXT: s_sub_i32 s8, s5, s4 -; GFX6-NEXT: s_cmp_ge_u32 s5, s4 -; GFX6-NEXT: s_cselect_b32 s5, s8, 
s5 -; GFX6-NEXT: s_sub_i32 s8, s5, s4 -; GFX6-NEXT: s_cmp_ge_u32 s5, s4 -; GFX6-NEXT: s_cselect_b32 s4, s8, s5 -; GFX6-NEXT: s_abs_i32 s5, s11 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_sub_i32 s8, 0, s5 -; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s8, s8, s2 +; GFX6-NEXT: s_abs_i32 s9, s11 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX6-NEXT: s_sub_i32 s2, 0, s9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_abs_i32 s0, s7 -; GFX6-NEXT: v_mul_lo_u32 v1, s8, v2 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; GFX6-NEXT: s_xor_b32 s2, s4, s6 -; GFX6-NEXT: s_sub_i32 s2, s2, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_abs_i32 s4, s7 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v2 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_ashr_i32 s1, s7, 31 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_ashr_i32 s5, s7, 31 +; GFX6-NEXT: s_xor_b32 s7, s8, s6 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v2, s0, v2 -; GFX6-NEXT: v_readfirstlane_b32 s3, v2 -; GFX6-NEXT: s_mul_i32 s3, s3, s5 -; GFX6-NEXT: s_sub_i32 s0, s0, s3 -; GFX6-NEXT: s_sub_i32 s3, s0, s5 -; GFX6-NEXT: s_cmp_ge_u32 s0, s5 -; GFX6-NEXT: s_cselect_b32 s0, s3, s0 -; GFX6-NEXT: s_sub_i32 s3, s0, s5 -; GFX6-NEXT: s_cmp_ge_u32 s0, s5 -; GFX6-NEXT: s_cselect_b32 s0, s3, s0 -; GFX6-NEXT: s_xor_b32 s0, s0, s1 -; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NEXT: v_mul_hi_u32 v2, s4, v2 +; GFX6-NEXT: s_sub_i32 s6, s7, s6 +; GFX6-NEXT: v_readfirstlane_b32 s7, v2 +; GFX6-NEXT: s_mul_i32 s7, s7, s9 +; GFX6-NEXT: s_sub_i32 s4, s4, s7 +; GFX6-NEXT: s_sub_i32 s7, s4, s9 +; GFX6-NEXT: s_cmp_ge_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s7, s4, s9 +; GFX6-NEXT: s_cmp_ge_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s4, s7, s4 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_sub_i32 s8, 0, s0 -; GFX9-NEXT: s_ashr_i32 s1, s4, 31 +; GFX9-NEXT: s_abs_i32 s2, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_sub_i32 s8, 0, s2 +; GFX9-NEXT: s_ashr_i32 s3, s4, 31 ; GFX9-NEXT: s_abs_i32 s4, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2367,73 +2365,72 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 ; GFX9-NEXT: s_add_i32 s12, s12, s8 ; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 -; GFX9-NEXT: s_mul_i32 s8, s8, s0 +; GFX9-NEXT: s_mul_i32 s8, s8, s2 ; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_sub_i32 s8, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_sub_i32 s8, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 ; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_sub_i32 s8, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s0, s8, s4 +; GFX9-NEXT: s_sub_i32 s8, s4, s2 +; GFX9-NEXT: 
s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s8, s4 ; GFX9-NEXT: s_abs_i32 s4, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: s_sub_i32 s9, 0, s4 -; GFX9-NEXT: s_sub_i32 s12, s0, s1 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s8, s5, 31 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s0 -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s9 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s0, s0, s4 -; GFX9-NEXT: s_sub_i32 s0, s5, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, s4 -; GFX9-NEXT: s_cmp_ge_u32 s0, s4 -; GFX9-NEXT: s_cselect_b32 s0, s1, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, s4 -; GFX9-NEXT: s_cmp_ge_u32 s0, s4 -; GFX9-NEXT: s_cselect_b32 s0, s1, s0 -; GFX9-NEXT: s_abs_i32 s1, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s8 -; GFX9-NEXT: s_ashr_i32 s4, s6, 31 -; GFX9-NEXT: s_abs_i32 s5, s6 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s3 +; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9 +; GFX9-NEXT: s_add_i32 s3, s3, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s3, s3, s4 +; GFX9-NEXT: s_sub_i32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s5, s3, s4 +; GFX9-NEXT: s_cmp_ge_u32 s3, s4 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s5, s3, s4 +; GFX9-NEXT: s_cmp_ge_u32 s3, s4 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_abs_i32 s4, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s8 +; GFX9-NEXT: s_sub_i32 s9, 0, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, 0, s1 -; GFX9-NEXT: s_sub_i32 s8, s0, s8 +; GFX9-NEXT: s_ashr_i32 s5, s6, 31 +; GFX9-NEXT: s_abs_i32 s6, s6 ; GFX9-NEXT: v_mul_f32_e32 v0, 
0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s0 -; GFX9-NEXT: s_mul_hi_u32 s6, s0, s6 -; GFX9-NEXT: s_add_i32 s0, s0, s6 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s5, s0 -; GFX9-NEXT: s_sub_i32 s5, s0, s1 -; GFX9-NEXT: s_cmp_ge_u32 s0, s1 -; GFX9-NEXT: s_cselect_b32 s0, s5, s0 -; GFX9-NEXT: s_sub_i32 s5, s0, s1 -; GFX9-NEXT: s_cmp_ge_u32 s0, s1 -; GFX9-NEXT: s_cselect_b32 s5, s5, s0 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s8 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9 +; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8 +; GFX9-NEXT: s_mul_i32 s8, s8, s4 +; GFX9-NEXT: s_sub_i32 s6, s6, s8 +; GFX9-NEXT: s_sub_i32 s8, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s6, s8, s6 +; GFX9-NEXT: s_sub_i32 s8, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s4, s8, s6 ; GFX9-NEXT: s_abs_i32 s6, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_ashr_i32 s2, s7, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_abs_i32 s3, s7 ; GFX9-NEXT: s_sub_i32 s7, 0, s6 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_sub_i32 s4, s4, s5 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: s_mul_i32 s7, s7, s5 ; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 @@ -2451,7 +2448,6 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <4 x i32> %x, %y @@ -2546,8 +2542,8 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: udiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2606,21 +2602,21 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s0, s4, 0xffff +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 @@ -2658,7 +2654,6 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <4 x i16> %x, %y 
@@ -2761,8 +2756,8 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: urem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2829,34 +2824,35 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_and_b32 s8, s4, 0xffff +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s3, s5, 0xffff +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: s_and_b32 s3, s7, 0xffff +; 
GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX9-NEXT: v_trunc_f32_e32 v2, v5 +; GFX9-NEXT: s_and_b32 s8, s5, 0xffff ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc @@ -2871,21 +2867,20 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 -; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 ; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, s3, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 +; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 @@ -2999,8 +2994,8 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ 
-3079,79 +3074,79 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_sext_i32_i16 s0, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i16 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s8, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s8, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 -; GFX9-NEXT: s_ashr_i32 s3, s6, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 +; GFX9-NEXT: s_ashr_i32 s1, s6, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v3 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 ; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 -; GFX9-NEXT: s_xor_b32 s2, s4, s3 +; GFX9-NEXT: s_xor_b32 s0, s4, s1 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX9-NEXT: s_or_b32 
s4, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s3, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s2, v4 -; GFX9-NEXT: s_sext_i32_i16 s2, s5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GFX9-NEXT: s_sext_i32_i16 s1, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 +; GFX9-NEXT: s_sext_i32_i16 s0, s5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: s_ashr_i32 s3, s7, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: v_add_u32_e32 v1, s2, v5 -; GFX9-NEXT: s_ashr_i32 s2, s5, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 +; GFX9-NEXT: s_ashr_i32 s0, s5, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: 
s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v6 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3269,8 +3264,8 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: srem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3361,78 +3356,78 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: srem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GFX9-NEXT: s_sext_i32_i16 s9, s4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX9-NEXT: s_xor_b32 s2, s9, s8 +; GFX9-NEXT: s_xor_b32 s0, s9, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; 
GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s10, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s10, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s10, 0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s10, 0 ; GFX9-NEXT: s_ashr_i32 s6, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: v_add_u32_e32 v1, s2, v3 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s2, s4, s6 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_xor_b32 s0, s4, s6 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: s_or_b32 s8, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v0| +; GFX9-NEXT: s_or_b32 s8, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 ; GFX9-NEXT: s_sext_i32_i16 s8, s7 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s8 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v4 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX9-NEXT: s_sext_i32_i16 s6, s5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_xor_b32 s2, s6, s8 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s10, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s6, s8 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s10, s0, 1 ; 
GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s10, 0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s10, 0 ; GFX9-NEXT: s_ashr_i32 s7, s7, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s7 ; GFX9-NEXT: s_ashr_i32 s5, s5, 16 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v5 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_xor_b32 s2, s5, s7 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_xor_b32 s0, s5, s7 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s8 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: s_or_b32 s8, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s2, v6 +; GFX9-NEXT: s_or_b32 s8, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s7 ; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s6, v3 @@ -3441,7 +3436,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %r = srem <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out 
@@ -3472,8 +3467,8 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: udiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 @@ -3494,15 +3489,15 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: udiv_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s2, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: s_and_b32 s2, s4, 7 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 +; GFX9-NEXT: s_and_b32 s0, s4, 7 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 @@ -3510,7 +3505,7 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX9-NEXT: global_store_byte v2, v0, s[0:1] +; GFX9-NEXT: global_store_byte v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv i3 %x, %y store i3 %r, ptr addrspace(1) %out @@ -3543,8 +3538,8 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: urem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -3568,24 +3563,24 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: urem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: s_and_b32 s1, s4, 7 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: s_and_b32 s4, s2, 7 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 +; GFX9-NEXT: s_lshr_b32 s3, s2, 8 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 ; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] @@ -3623,8 +3618,8 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: sdiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3650,28 +3645,28 @@ define amdgpu_kernel void @sdiv_i3(ptr 
addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: sdiv_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX9-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i3 %x, %y store i3 %r, ptr addrspace(1) %out @@ -3708,8 +3703,8 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: srem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX6-NEXT: s_bfe_i32 s2, s4, 0x30008 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 @@ -3738,27 +3733,27 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: srem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_lshr_b32 s5, s4, 8 -; GFX9-NEXT: s_or_b32 s6, s0, 1 +; GFX9-NEXT: s_or_b32 s6, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 @@ -3837,8 +3832,8 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: udiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3884,21 +3879,21 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s0, s4, 0xffff +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 @@ -3923,7 +3918,6 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v6, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4005,8 +3999,8 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: urem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; 
GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4058,33 +4052,33 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s6, 0xffff +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_and_b32 s8, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 -; GFX9-NEXT: s_and_b32 s3, s5, 0xffff +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: s_and_b32 s3, s7, 0xffff +; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, 
|v2|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc @@ -4093,16 +4087,17 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4190,8 +4185,8 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4252,62 +4247,62 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: sdiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: 
v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_sext_i32_i16 s0, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i16 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s8, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s8, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 -; GFX9-NEXT: s_ashr_i32 s3, s6, 16 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 +; GFX9-NEXT: s_ashr_i32 s1, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: v_add_u32_e32 v2, s2, v3 +; GFX9-NEXT: v_add_u32_e32 v2, s0, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s2, s4, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s4, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s3, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v4 -; GFX9-NEXT: s_sext_i32_i16 s2, s5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s2 +; GFX9-NEXT: s_sext_i32_i16 s1, s7 +; 
GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 +; GFX9-NEXT: s_sext_i32_i16 s0, s5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: global_store_short v1, v0, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v1, v2, s[0:1] +; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 +; GFX9-NEXT: global_store_dword v1, v2, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4399,8 +4394,8 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: srem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4469,8 +4464,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: srem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 @@ -4522,6 +4516,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cselect_b32 s2, s6, 0 ; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s7 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -4529,6 +4524,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4604,31 +4600,33 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: udiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s2, s10, 0x7fff -; GFX6-NEXT: s_and_b32 s3, s0, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_and_b32 s5, s8, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_and_b32 s4, s6, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: s_bfe_u32 s4, s8, 0xf000f ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 
v5, s0 -; GFX6-NEXT: s_bfe_u32 s3, s10, 0xf000f -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 +; GFX6-NEXT: s_bfe_u32 s5, s6, 0xf000f +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -4651,33 +4649,31 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s6, 0x7fff -; GFX9-NEXT: s_and_b32 s3, s0, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; 
GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_and_b32 s0, s6, 0x7fff +; GFX9-NEXT: s_and_b32 s1, s2, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX9-NEXT: s_bfe_u32 s0, s2, 0xf000f ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f -; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_alignbit_b32 v3, s3, v3, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -4791,41 +4787,41 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: urem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_and_b32 s8, s0, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX6-NEXT: s_and_b32 s3, s10, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_and_b32 s7, s8, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 -; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 +; 
GFX6-NEXT: s_bfe_u32 s5, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 +; GFX6-NEXT: s_bfe_u32 s7, s6, 0xf000f ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: s_bfe_u32 s8, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s10, v1 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s6, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 @@ -4834,32 +4830,32 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 -; GFX6-NEXT: s_lshr_b32 s0, s0, 15 +; GFX6-NEXT: s_lshr_b32 s5, s8, 15 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX6-NEXT: s_lshr_b32 s2, s10, 15 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_lshr_b32 s4, s6, 15 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v1 ; GFX6-NEXT: 
v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 @@ -5000,50 +4996,52 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: s_bfe_i32 s2, s0, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 +; GFX6-NEXT: 
v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s1, s1, s2 -; GFX6-NEXT: s_ashr_i32 s1, s1, 30 -; GFX6-NEXT: s_or_b32 s1, s1, 1 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| -; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_or_b32 s7, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: s_cselect_b32 s1, s1, 0 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, s1, v4 -; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX6-NEXT: s_cselect_b32 s4, s7, 0 +; GFX6-NEXT: s_bfe_i32 s5, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v4 +; GFX6-NEXT: s_bfe_i32 s4, s6, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: s_xor_b32 s0, s1, s0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_alignbit_b32 v1, s9, v1, 30 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 -; GFX6-NEXT: s_or_b32 s2, s0, 1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v2| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX6-NEXT: s_cselect_b32 s0, s2, 0 +; 
GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v5 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -5061,46 +5059,43 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_bfe_i32 s2, s0, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 ; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_xor_b32 s1, s1, s2 -; GFX9-NEXT: s_ashr_i32 s1, s1, 30 -; GFX9-NEXT: s_or_b32 s1, s1, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 +; GFX9-NEXT: s_or_b32 s3, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; 
GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s1, s1, 0 -; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX9-NEXT: v_add_u32_e32 v4, s1, v5 -; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX9-NEXT: s_cselect_b32 s0, s3, 0 +; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v5 +; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -5110,6 +5105,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s2, 0 @@ -5227,73 +5223,73 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: srem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_alignbit_b32 v2, 
s1, v2, 30 -; GFX6-NEXT: s_bfe_i32 s1, s0, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s2 -; GFX6-NEXT: s_xor_b32 s1, s2, s1 -; GFX6-NEXT: s_ashr_i32 s1, s1, 30 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_mov_b32 s5, s9 -; GFX6-NEXT: s_lshr_b32 s8, s10, 15 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v6, v6 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX6-NEXT: s_lshr_b32 s9, s0, 15 -; GFX6-NEXT: s_or_b32 s1, s1, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| -; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX6-NEXT: s_cselect_b32 s1, s1, 0 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s1, v6 -; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0 -; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s1 -; GFX6-NEXT: s_xor_b32 s0, s1, s0 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_lshr_b32 s7, s6, 15 +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 +; GFX6-NEXT: s_lshr_b32 s9, s8, 15 +; GFX6-NEXT: s_or_b32 s10, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, |v4| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s10, 0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v6 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s4 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s5 +; GFX6-NEXT: v_mul_lo_u32 v4, v4, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_and_b32_e32 
v3, 0x7fff, v2 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v7, v7 ; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 -; GFX6-NEXT: s_or_b32 s2, s0, 1 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s6, v4 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| ; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX6-NEXT: s_cselect_b32 s0, s2, 0 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, s0, v7 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, s4, v7 ; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v7, v8 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s8, v5 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 @@ -5301,54 +5297,54 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: 
v_lshlrev_b32_e32 v2, 15, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s6, 0xf0000 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GFX9-NEXT: s_bfe_i32 s1, s0, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 -; GFX9-NEXT: s_xor_b32 s1, s2, s1 +; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_ashr_i32 s1, s1, 30 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_lshr_b32 s8, s6, 15 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: s_lshr_b32 s7, s0, 15 -; GFX9-NEXT: s_or_b32 s1, s1, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s1, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s1, v6 -; GFX9-NEXT: s_bfe_i32 s1, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 -; GFX9-NEXT: s_bfe_i32 s0, 
s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 +; GFX9-NEXT: s_lshr_b32 s3, s2, 15 +; GFX9-NEXT: s_or_b32 s7, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s7, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 ; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 ; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2 ; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 @@ -5367,7 +5363,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v5, s7 +; GFX9-NEXT: v_mul_lo_u32 v5, v5, s3 ; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -5397,8 +5393,8 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: udiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -5413,17 
+5409,17 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: udiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881 -; GFX9-NEXT: s_sub_i32 s3, s4, s2 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_lshr_b32 s2, s3, 20 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 +; GFX9-NEXT: s_sub_i32 s1, s4, s0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s1, 20 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -5438,8 +5434,8 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: udiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5450,13 +5446,13 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: udiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s4, 12 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 
+; GFX9-NEXT: s_lshr_b32 s0, s4, 12 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -5472,7 +5468,7 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: udiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5486,7 +5482,7 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s3, s3, 12 @@ -5513,7 +5509,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: udiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5528,7 +5524,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 12 @@ -5555,7 +5551,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 
s6, -1 @@ -5574,7 +5570,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x100101 @@ -5664,42 +5660,42 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_mul_i32 s3, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s4, s3 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; 
GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_cselect_b32 s3, s4, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_sub_i32 s4, 0, s6 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_readfirstlane_b32 s0, v1 @@ -5720,54 +5716,54 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 -; GFX9-NEXT: s_mul_i32 s3, s2, s6 -; GFX9-NEXT: s_sub_i32 s3, 
s4, s3 -; GFX9-NEXT: s_add_i32 s9, s2, 1 -; GFX9-NEXT: s_sub_i32 s4, s3, s6 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s2, s9, s2 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_add_i32 s4, s2, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7 +; GFX9-NEXT: s_mul_i32 s7, s6, s3 +; GFX9-NEXT: s_sub_i32 s4, s4, s7 +; GFX9-NEXT: s_add_i32 s9, s6, 1 +; GFX9-NEXT: s_sub_i32 s7, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: s_add_i32 s7, s6, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: s_sub_i32 s3, 0, s7 -; GFX9-NEXT: s_mul_i32 s3, s3, s8 -; GFX9-NEXT: s_mul_hi_u32 s3, s8, s3 -; GFX9-NEXT: s_add_i32 s8, s8, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s8 -; GFX9-NEXT: s_mul_i32 s4, s3, s7 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s6, s3, 1 -; GFX9-NEXT: s_sub_i32 s5, s4, s7 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s5, s3, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_cselect_b32 s3, s7, s6 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8 +; GFX9-NEXT: s_mul_i32 s6, s4, s2 +; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_add_i32 s7, s4, 1 +; GFX9-NEXT: s_sub_i32 s6, s5, s2 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: s_cselect_b32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; 
GFX9-NEXT: s_cselect_b32 s2, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -5784,10 +5780,10 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: urem_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 ; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -5803,19 +5799,19 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: urem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881 -; GFX9-NEXT: s_sub_i32 s3, s4, s2 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_lshr_b32 s2, s3, 20 -; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 +; GFX9-NEXT: s_sub_i32 s1, s4, s0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s1, 20 +; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb +; GFX9-NEXT: s_sub_i32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -5830,8 +5826,8 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(ptr 
addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: urem_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5842,13 +5838,13 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: urem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s4, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -5864,7 +5860,7 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: urem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5879,7 +5875,7 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 @@ -5907,7 +5903,7 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: urem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5922,7 +5918,7 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xfff @@ -6004,35 +6000,35 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s4, s4, s1 -; GFX6-NEXT: s_sub_i32 
s0, 0, s6 -; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_mul_i32 s3, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s4, s3 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s3, s4, s3 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s4, s4, s3 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 @@ -6049,56 +6045,55 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cselect_b32 s5, s7, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 -; 
GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 -; GFX9-NEXT: s_mul_i32 s2, s2, s6 -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s6 -; GFX9-NEXT: s_cmp_ge_u32 s2, s6 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s6 -; GFX9-NEXT: s_cmp_ge_u32 s2, s6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s3 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: s_sub_i32 s6, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-NEXT: s_sub_i32 s6, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, 0, s7 -; GFX9-NEXT: s_mul_i32 s3, s3, s8 -; GFX9-NEXT: s_mul_hi_u32 s3, s8, s3 -; GFX9-NEXT: s_add_i32 s8, s8, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s8 -; GFX9-NEXT: s_mul_i32 s3, s3, s7 -; GFX9-NEXT: s_sub_i32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s4, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_sub_i32 s4, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_cselect_b32 s3, s6, s4 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8 +; GFX9-NEXT: s_mul_i32 s4, s4, s2 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; 
GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -6115,8 +6110,8 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: sdiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -6131,17 +6126,17 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: sdiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s2, s4, 0xd9528441 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s3, s2, 31 -; GFX9-NEXT: s_ashr_i32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 +; GFX9-NEXT: s_add_i32 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s1, s0, 31 +; GFX9-NEXT: s_ashr_i32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -6156,8 +6151,8 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: sdiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6171,16 +6166,16 @@ 
define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: sdiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s4, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s4, s4, s2 -; GFX9-NEXT: s_ashr_i32 s2, s4, 12 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s4, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s4, s4, s0 +; GFX9-NEXT: s_ashr_i32 s0, s4, 12 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -6196,7 +6191,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: sdiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6239,7 +6234,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 @@ -6294,7 +6289,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: sdiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) @@ -6315,7 +6310,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s2, 31 @@ -6348,7 +6343,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -6370,7 +6365,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s2, 31 @@ -6481,50 +6476,50 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s1, s0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX6-NEXT: s_sub_i32 s6, 0, s1 -; GFX6-NEXT: s_xor_b32 s0, s4, s0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s3, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s6, 0, s3 +; GFX6-NEXT: s_xor_b32 s2, s4, s2 ; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: 
v_mul_lo_u32 v1, s6, v0 ; GFX6-NEXT: s_abs_i32 s6, s4 -; GFX6-NEXT: s_ashr_i32 s4, s0, 31 +; GFX6-NEXT: s_ashr_i32 s4, s2, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s1 -; GFX6-NEXT: s_sub_i32 s0, s6, s0 -; GFX6-NEXT: s_sub_i32 s6, s0, s1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s6, s2 +; GFX6-NEXT: s_sub_i32 s6, s2, s3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s1 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s0, s6, s0 +; GFX6-NEXT: s_cselect_b32 s2, s6, s2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s1 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: s_abs_i32 s6, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_xor_b32 s7, s5, s7 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_abs_i32 s5, s5 ; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_ashr_i32 s7, s7, 31 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v2 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 @@ -6544,73 +6539,71 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 
s7, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX9-NEXT: s_abs_i32 s1, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_xor_b32 s0, s4, s0 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s3, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX9-NEXT: s_abs_i32 s7, s4 +; GFX9-NEXT: s_xor_b32 s2, s4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s4, s0, 31 -; GFX9-NEXT: s_sub_i32 s0, 0, s1 +; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s8 -; GFX9-NEXT: s_mul_hi_u32 s0, s8, s0 -; GFX9-NEXT: s_add_i32 s8, s8, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s7, s8 -; GFX9-NEXT: s_mul_i32 s8, s0, s1 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s7, s8 +; GFX9-NEXT: s_mul_i32 s8, s4, s3 ; GFX9-NEXT: s_sub_i32 s7, s7, s8 -; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_sub_i32 s8, s7, s1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s1 -; GFX9-NEXT: s_cselect_b32 s0, s9, s0 +; GFX9-NEXT: s_add_i32 s9, s4, 1 +; GFX9-NEXT: s_sub_i32 s8, s7, s3 +; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_cselect_b32 s4, s9, s4 ; GFX9-NEXT: s_cselect_b32 s7, s8, s7 -; GFX9-NEXT: s_add_i32 s8, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s1 -; GFX9-NEXT: s_cselect_b32 s7, s8, s0 -; GFX9-NEXT: s_abs_i32 s8, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 -; GFX9-NEXT: s_xor_b32 s2, s5, s6 -; GFX9-NEXT: s_abs_i32 s3, s5 +; GFX9-NEXT: s_add_i32 s8, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_cselect_b32 s3, s8, s4 +; GFX9-NEXT: s_abs_i32 s4, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_sub_i32 s7, 0, s4 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s7, s4 -; GFX9-NEXT: s_sub_i32 s6, 0, s8 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_xor_b32 s6, s5, s6 +; GFX9-NEXT: s_abs_i32 s5, s5 +; GFX9-NEXT: s_ashr_i32 s6, s6, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 31 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 -; GFX9-NEXT: s_mul_i32 s6, s5, s8 -; GFX9-NEXT: s_sub_i32 s3, s3, s6 -; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_sub_i32 s6, s3, s8 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s7, s7, s3 +; GFX9-NEXT: s_mul_hi_u32 s7, s3, s7 +; GFX9-NEXT: s_add_i32 s3, s3, s7 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s7, s3, s4 +; GFX9-NEXT: s_sub_i32 s5, s5, s7 +; GFX9-NEXT: s_add_i32 s8, s3, 1 +; GFX9-NEXT: s_sub_i32 s7, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s8, s3 ; GFX9-NEXT: s_cselect_b32 s5, s7, s5 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s6, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s3, s6, s5 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s7, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s7, s3 +; GFX9-NEXT: s_xor_b32 s3, s3, s6 +; GFX9-NEXT: 
s_sub_i32 s3, s3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -6627,9 +6620,9 @@ define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: srem_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6647,19 +6640,19 @@ define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: srem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s2, s4, 0xd9528441 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s3, s2, 31 -; GFX9-NEXT: s_ashr_i32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 +; GFX9-NEXT: s_add_i32 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s1, s0, 31 +; GFX9-NEXT: s_ashr_i32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb +; GFX9-NEXT: s_sub_i32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -6674,8 +6667,8 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: 
srem_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6690,17 +6683,17 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: srem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s4, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s2, s2, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s4, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s0, s4, s0 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -6716,7 +6709,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: srem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 @@ -6753,7 +6746,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 @@ -6805,7 +6798,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: srem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6828,7 +6821,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s2, 31 @@ -6934,44 +6927,44 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s0, s0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s2, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX6-NEXT: s_abs_i32 s1, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_abs_i32 s3, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: 
s_mul_i32 s7, s7, s0 -; GFX6-NEXT: s_sub_i32 s1, s1, s7 -; GFX6-NEXT: s_sub_i32 s7, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s1, s7, s1 -; GFX6-NEXT: s_sub_i32 s7, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s7, s7, s1 +; GFX6-NEXT: s_mul_i32 s7, s7, s2 +; GFX6-NEXT: s_sub_i32 s3, s3, s7 +; GFX6-NEXT: s_sub_i32 s7, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s3, s7, s3 +; GFX6-NEXT: s_sub_i32 s7, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s7, s7, s3 ; GFX6-NEXT: s_abs_i32 s6, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s0, 0, s6 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: s_abs_i32 s8, s5 ; GFX6-NEXT: s_xor_b32 s7, s7, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_sub_i32 s4, s7, s4 ; GFX6-NEXT: s_ashr_i32 s5, s5, 31 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -6989,20 +6982,20 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s5, s6, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX9-NEXT: s_abs_i32 s0, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_lshl_b32 s1, 
0x1000, s7 -; GFX9-NEXT: s_sub_i32 s7, 0, s0 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s7 +; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: s_ashr_i32 s6, s4, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_abs_i32 s4, s4 @@ -7013,43 +7006,41 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 ; GFX9-NEXT: s_add_i32 s8, s8, s7 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s8 -; GFX9-NEXT: s_mul_i32 s7, s7, s0 +; GFX9-NEXT: s_mul_i32 s7, s7, s2 ; GFX9-NEXT: s_sub_i32 s4, s4, s7 -; GFX9-NEXT: s_sub_i32 s7, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_sub_i32 s7, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_sub_i32 s7, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 ; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_abs_i32 s7, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_sub_i32 s7, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s7, s4 +; GFX9-NEXT: s_abs_i32 s3, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_xor_b32 s2, s2, s6 +; GFX9-NEXT: s_sub_i32 s7, 0, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_abs_i32 s3, s5 -; GFX9-NEXT: s_sub_i32 s5, 0, s7 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: s_ashr_i32 s4, s5, 31 +; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s6 -; GFX9-NEXT: s_mul_i32 s5, s5, s7 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_sub_i32 s5, s3, s7 -; 
GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s5, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mul_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 +; GFX9-NEXT: s_mul_i32 s6, s6, s3 +; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_sub_i32 s6, s5, s3 +; GFX9-NEXT: s_cmp_ge_u32 s5, s3 +; GFX9-NEXT: s_cselect_b32 s5, s6, s5 +; GFX9-NEXT: s_sub_i32 s6, s5, s3 +; GFX9-NEXT: s_cmp_ge_u32 s5, s3 +; GFX9-NEXT: s_cselect_b32 s3, s6, s5 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -7078,7 +7069,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s4 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 ; GFX6-NEXT: s_mul_i32 s6, s5, 0x68958c89 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -7163,11 +7154,11 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: udiv_i64_oddk_denom: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_add_u32 s0, 3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xe3e0f6 ; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: s_addc_u32 s0, s1, 0 @@ -7276,7 +7267,7 
@@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: udiv_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7290,7 +7281,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: udiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 @@ -7312,8 +7303,8 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: udiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7328,12 +7319,12 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: udiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, 12 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 +; GFX9-NEXT: s_add_i32 s2, s2, 12 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -7357,8 +7348,8 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: udiv_v2i64_pow2k_denom: ; GFX6: ; 
%bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7373,17 +7364,17 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: udiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7403,8 +7394,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s2, 0x2ff2fc01 ; GFX6-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7490,13 +7481,13 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x2ff2fc01 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 ; GFX9-NEXT: s_add_u32 s4, 0xe037f, s8 ; GFX9-NEXT: s_addc_u32 s5, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 @@ -7579,9 +7570,9 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7604,27 +7595,27 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s8, 12 -; GFX6-NEXT: s_add_i32 s2, s10, 12 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: s_add_i32 s8, s8, 12 +; GFX6-NEXT: s_add_i32 s9, s10, 12 +; GFX6-NEXT: 
s_lshr_b64 s[4:5], s[4:5], s8 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s2, s8, 12 ; GFX9-NEXT: s_add_i32 s8, s10, 12 @@ -7650,12 +7641,12 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: urem_i64_oddk_denom: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_add_u32 s0, 4, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xe3e0fc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_addc_u32 s1, 0, 0 ; GFX6-NEXT: s_or_b32 s0, vcc_lo, vcc_hi -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_mov_b32 s0, 0x689e0837 ; GFX6-NEXT: s_movk_i32 s2, 0xfee0 @@ -7746,11 +7737,11 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: urem_i64_oddk_denom: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_add_u32 s0, 4, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xe3e0fc ; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: s_addc_u32 s0, s1, 0 @@ -7857,7 +7848,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: urem_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 @@ -7871,7 +7862,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: urem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xfff @@ -7892,8 +7883,8 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: urem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7910,11 +7901,11 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: urem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s2 ; GFX9-NEXT: s_add_u32 s0, s0, -1 ; GFX9-NEXT: s_addc_u32 s1, s1, -1 ; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] @@ -7941,8 +7932,8 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: urem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ 
-7957,16 +7948,16 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: urem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0xfff -; GFX9-NEXT: s_and_b32 s3, s6, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] +; GFX9-NEXT: s_and_b32 s0, s4, 0xfff +; GFX9-NEXT: s_and_b32 s1, s6, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = urem <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7989,31 +7980,31 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s10 -; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 -; GFX6-NEXT: s_add_u32 s2, s2, -1 -; GFX6-NEXT: s_addc_u32 s3, s3, -1 -; GFX6-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; GFX6-NEXT: s_add_u32 s0, s0, -1 -; GFX6-NEXT: s_addc_u32 s1, s1, -1 -; GFX6-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, 
s[12:15], 0 +; GFX6-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 +; GFX6-NEXT: s_lshl_b64 s[8:9], 0x1000, s8 +; GFX6-NEXT: s_add_u32 s8, s8, -1 +; GFX6-NEXT: s_addc_u32 s9, s9, -1 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_add_u32 s8, s10, -1 +; GFX6-NEXT: s_addc_u32 s9, s11, -1 +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s10 ; GFX9-NEXT: s_lshl_b64 s[8:9], 0x1000, s8 @@ -8043,7 +8034,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: sdiv_i64_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x33fe64 ; GFX6-NEXT: s_add_u32 s1, 0x396, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x28100000 @@ -8159,7 +8150,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX9-NEXT: s_mul_i32 s10, s4, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s11 ; GFX9-NEXT: s_add_u32 s6, s6, s10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 ; GFX9-NEXT: s_addc_u32 s6, s8, s9 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 @@ -8243,7 +8234,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: sdiv_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8261,7 +8252,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: sdiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -8287,21 +8278,21 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX6-NEXT: s_ashr_i32 s8, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s8 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 +; GFX6-NEXT: s_ashr_i32 s8, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s1, s1, s8 -; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] +; GFX6-NEXT: s_addc_u32 s3, s3, s8 +; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GFX6-NEXT: s_sub_u32 s4, 0, s10 ; GFX6-NEXT: s_subb_u32 s5, 0, s11 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8423,19 +8414,19 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; 
GFX9-NEXT: s_ashr_i32 s8, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s8 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s8 -; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX9-NEXT: s_sub_u32 s0, 0, s10 -; GFX9-NEXT: s_subb_u32 s1, 0, s11 +; GFX9-NEXT: s_lshl_b64 s[4:5], 0x1000, s2 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_add_u32 s4, s4, s2 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s5, s5, s2 +; GFX9-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_sub_u32 s0, 0, s8 +; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -8445,60 +8436,61 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 -; GFX9-NEXT: s_mul_i32 s12, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s14, s0, s3 -; GFX9-NEXT: s_mul_i32 s13, s1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: v_readfirstlane_b32 s11, v1 +; GFX9-NEXT: s_mul_i32 s12, s0, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s0, s11 +; GFX9-NEXT: s_mul_i32 s13, s1, s11 ; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s0, s3 +; GFX9-NEXT: s_mul_i32 s15, s0, s11 ; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s3, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s3, s12 -; GFX9-NEXT: s_mul_i32 s3, s3, s12 -; GFX9-NEXT: s_add_u32 s3, s14, s3 +; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 +; GFX9-NEXT: s_mul_i32 s11, s11, s12 +; GFX9-NEXT: s_add_u32 s11, s14, s11 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; 
GFX9-NEXT: s_mul_hi_u32 s16, s2, s15 -; GFX9-NEXT: s_mul_i32 s15, s2, s15 -; GFX9-NEXT: s_add_u32 s3, s3, s15 -; GFX9-NEXT: s_mul_hi_u32 s14, s2, s12 -; GFX9-NEXT: s_addc_u32 s3, s13, s16 +; GFX9-NEXT: s_mul_hi_u32 s16, s10, s15 +; GFX9-NEXT: s_mul_i32 s15, s10, s15 +; GFX9-NEXT: s_add_u32 s11, s11, s15 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s12 +; GFX9-NEXT: s_addc_u32 s11, s13, s16 ; GFX9-NEXT: s_addc_u32 s13, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s2, s12 -; GFX9-NEXT: s_add_u32 s3, s3, s12 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s11, s11, s12 ; GFX9-NEXT: s_addc_u32 s12, 0, s13 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s3, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s11, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s12 +; GFX9-NEXT: s_addc_u32 s10, s10, s12 ; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: s_mul_i32 s3, s0, s2 +; GFX9-NEXT: s_mul_i32 s11, s0, s10 ; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 -; GFX9-NEXT: s_add_i32 s3, s13, s3 +; GFX9-NEXT: s_add_i32 s11, s13, s11 ; GFX9-NEXT: s_mul_i32 s1, s1, s12 -; GFX9-NEXT: s_add_i32 s3, s3, s1 +; GFX9-NEXT: s_add_i32 s11, s11, s1 ; GFX9-NEXT: s_mul_i32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s2, s0 -; GFX9-NEXT: s_mul_i32 s14, s2, s0 -; GFX9-NEXT: s_mul_i32 s16, s12, s3 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0 +; GFX9-NEXT: s_mul_i32 s14, s10, s0 +; GFX9-NEXT: s_mul_i32 s16, s12, s11 ; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s3 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11 ; GFX9-NEXT: s_add_u32 s0, s0, s16 ; GFX9-NEXT: s_addc_u32 s12, 0, s15 ; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11 ; GFX9-NEXT: s_addc_u32 s0, s12, s13 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s3, s2, s3 -; GFX9-NEXT: s_add_u32 s0, s0, s3 +; GFX9-NEXT: s_mul_i32 s11, s10, s11 +; GFX9-NEXT: s_add_u32 s0, s0, s11 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, 
v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s12, s2, s1 -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s1, s7, s2 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] +; GFX9-NEXT: s_addc_u32 s12, s10, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s10, s7, 31 +; GFX9-NEXT: s_add_u32 s0, s6, s10 +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: s_addc_u32 s1, s7, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX9-NEXT: v_readfirstlane_b32 s13, v1 ; GFX9-NEXT: s_mul_i32 s1, s6, s12 ; GFX9-NEXT: s_mul_hi_u32 s14, s6, s13 @@ -8514,24 +8506,24 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_i32 s12, s7, s12 ; GFX9-NEXT: s_add_u32 s12, s0, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s1 -; GFX9-NEXT: s_mul_i32 s0, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s1, s10, s12 +; GFX9-NEXT: s_mul_i32 s0, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s1, s8, s12 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s11, s12 +; GFX9-NEXT: s_mul_i32 s1, s9, s12 ; GFX9-NEXT: s_add_i32 s14, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s10, s12 +; GFX9-NEXT: s_mul_i32 s1, s8, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_sub_i32 s0, s7, s14 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s6, s0, s11 -; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s10, v1 +; GFX9-NEXT: s_subb_u32 s6, s0, s9 +; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s8, v1 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s6, s6, 0 -; GFX9-NEXT: s_cmp_ge_u32 s6, s11 +; GFX9-NEXT: s_cmp_ge_u32 s6, s9 ; GFX9-NEXT: s_cselect_b32 s15, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX9-NEXT: s_cmp_eq_u32 s6, s11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 +; GFX9-NEXT: s_cmp_eq_u32 s6, s9 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: s_cselect_b64 
s[0:1], -1, 0 @@ -8549,10 +8541,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX9-NEXT: s_subb_u32 s0, s7, s14 -; GFX9-NEXT: s_cmp_ge_u32 s0, s11 +; GFX9-NEXT: s_cmp_ge_u32 s0, s9 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v1 -; GFX9-NEXT: s_cmp_eq_u32 s0, s11 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 +; GFX9-NEXT: s_cmp_eq_u32 s0, s9 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -8562,7 +8554,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] +; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 @@ -8589,8 +8581,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: sdiv_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8613,25 +8605,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: sdiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_lshr_b32 
s2, s2, 20 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, 0 +; GFX9-NEXT: s_ashr_i32 s0, s5, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 ; GFX9-NEXT: s_ashr_i32 s4, s7, 31 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20 ; GFX9-NEXT: s_add_u32 s4, s6, s4 ; GFX9-NEXT: s_addc_u32 s5, s7, 0 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -8651,8 +8643,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s2, 0x2ff2fc01 ; GFX6-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -8752,17 +8744,17 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x2ff2fc01 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: 
s_addc_u32 s3, s5, 0 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GFX9-NEXT: s_ashr_i32 s0, s5, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 ; GFX9-NEXT: s_add_u32 s4, 0xe037f, s8 ; GFX9-NEXT: s_addc_u32 s5, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 @@ -8846,11 +8838,11 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: s_sub_u32 s5, s6, s4 ; GFX9-NEXT: s_subb_u32 s4, s7, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -8873,36 +8865,37 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 -; GFX6-NEXT: s_lshl_b64 s[14:15], 0x1000, s10 -; GFX6-NEXT: s_ashr_i32 s16, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s16 -; GFX6-NEXT: s_mov_b32 s17, s16 -; GFX6-NEXT: s_addc_u32 s1, s1, s16 -; GFX6-NEXT: s_xor_b64 s[12:13], s[0:1], s[16:17] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX6-NEXT: s_sub_u32 s0, 0, s12 -; GFX6-NEXT: s_subb_u32 s1, 0, s13 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX6-NEXT: s_lshl_b64 s[12:13], 0x1000, s10 +; GFX6-NEXT: s_ashr_i32 s14, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s14 +; GFX6-NEXT: s_mov_b32 
s15, s14 +; GFX6-NEXT: s_addc_u32 s3, s3, s14 +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[14:15] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX6-NEXT: s_sub_u32 s10, 0, s2 +; GFX6-NEXT: s_subb_u32 s11, 0, s3 +; GFX6-NEXT: s_ashr_i32 s16, s5, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_ashr_i32 s2, s5, 31 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GFX6-NEXT: s_add_u32 s0, s4, s16 +; GFX6-NEXT: s_mov_b32 s17, s16 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 +; GFX6-NEXT: s_addc_u32 s1, s5, s16 +; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s10, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -8921,11 +8914,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 ; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -8941,11 +8935,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: s_add_u32 s0, s4, s2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_addc_u32 s1, s5, s2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 @@ -8955,28 +8946,29 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 
s[0:1], s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -8985,23 +8977,23 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] -; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[16:17] -; GFX6-NEXT: s_ashr_i32 s2, s15, 31 -; GFX6-NEXT: s_add_u32 s4, s14, s2 +; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[14:15] +; GFX6-NEXT: s_ashr_i32 s4, s13, 31 +; GFX6-NEXT: s_add_u32 s12, s12, s4 ; GFX6-NEXT: v_mov_b32_e32 v6, s5 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_addc_u32 s5, s15, s2 -; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX6-NEXT: s_mov_b32 s5, s4 +; GFX6-NEXT: s_addc_u32 s13, s13, s4 +; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s5 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s13 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GFX6-NEXT: v_rcp_f32_e32 v6, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6 @@ -9010,16 
+9002,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: s_sub_u32 s12, 0, s4 +; GFX6-NEXT: s_sub_u32 s2, 0, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v4, s12, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v3 -; GFX6-NEXT: s_subb_u32 s13, 0, s5 -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 +; GFX6-NEXT: s_subb_u32 s3, 0, s13 +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 @@ -9038,11 +9030,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 @@ -9057,14 +9049,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: s_ashr_i32 s12, s7, 31 +; GFX6-NEXT: s_ashr_i32 s2, s7, 31 ; 
GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_add_u32 s6, s6, s12 +; GFX6-NEXT: s_add_u32 s6, s6, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: s_addc_u32 s7, s7, s12 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s7, s7, s2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[12:13] +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 ; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 @@ -9080,25 +9072,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s4, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s4, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, s5, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s4, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, s5 +; GFX6-NEXT: v_mov_b32_e32 v7, s13 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s4, v5 +; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 ; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v6 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v7 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v6 +; GFX6-NEXT: 
v_cmp_eq_u32_e64 s[0:1], s13, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v2 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] @@ -9109,15 +9101,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s7 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[2:3] +; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 @@ -9130,19 +9122,19 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 ; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX9-NEXT: s_ashr_i32 s8, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s8 +; GFX9-NEXT: s_ashr_i32 s8, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s8 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[8:9] +; GFX9-NEXT: s_addc_u32 s3, s3, s8 +; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] ; 
GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s12 ; GFX9-NEXT: s_subb_u32 s1, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 @@ -9416,6 +9408,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y @@ -9432,6 +9425,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: srem_i64_oddk_denom: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x33fe64 ; GFX6-NEXT: s_add_u32 s0, 0x396, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x28100000 @@ -9451,7 +9445,6 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, s1, v2 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v3, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc @@ -9546,7 +9539,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX9-NEXT: s_mul_i32 s10, s4, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s11 ; GFX9-NEXT: s_add_u32 s6, s6, s10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 ; GFX9-NEXT: s_addc_u32 s6, s8, s9 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 @@ -9633,7 +9626,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: srem_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9653,7 +9646,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: srem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -9681,21 +9674,21 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s4 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 +; GFX6-NEXT: s_ashr_i32 s4, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_addc_u32 s1, s1, s4 -; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] +; GFX6-NEXT: s_addc_u32 s3, s3, s4 +; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: s_sub_u32 s4, 0, s8 ; GFX6-NEXT: s_subb_u32 s5, 0, s9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9815,17 +9808,17 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: 
s_lshl_b64 s[2:3], 0x1000, s2 +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s4 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] +; GFX9-NEXT: s_addc_u32 s3, s3, s4 +; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s8 ; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 @@ -9979,8 +9972,8 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: srem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10007,17 +10000,17 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: srem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 0xfffff000 -; GFX9-NEXT: s_sub_u32 s2, s4, s2 -; GFX9-NEXT: s_subb_u32 s3, s5, s3 +; GFX9-NEXT: s_ashr_i32 s0, s5, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_sub_u32 s0, s4, s0 +; GFX9-NEXT: s_subb_u32 s1, s5, s1 ; GFX9-NEXT: 
s_ashr_i32 s4, s7, 31 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20 ; GFX9-NEXT: s_add_u32 s4, s6, s4 @@ -10025,11 +10018,11 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 ; GFX9-NEXT: s_sub_u32 s4, s6, s4 ; GFX9-NEXT: s_subb_u32 s5, s7, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = srem <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -10052,36 +10045,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 ; GFX6-NEXT: s_lshl_b64 s[16:17], 0x1000, s10 -; GFX6-NEXT: s_ashr_i32 s8, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s8 +; GFX6-NEXT: s_ashr_i32 s8, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s1, s1, s8 -; GFX6-NEXT: s_xor_b64 s[14:15], s[0:1], s[8:9] +; GFX6-NEXT: s_addc_u32 s3, s3, s8 +; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX6-NEXT: s_sub_u32 s0, 0, s14 -; GFX6-NEXT: s_subb_u32 s1, 0, s15 +; GFX6-NEXT: s_sub_u32 s2, 0, s14 +; GFX6-NEXT: s_subb_u32 s3, 0, s15 ; GFX6-NEXT: s_ashr_i32 s12, s5, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: 
s_load_dwordx2 s[8:9], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 +; GFX6-NEXT: s_addc_u32 s1, s5, s12 +; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -10100,11 +10096,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -10120,11 +10116,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_addc_u32 s1, s5, s12 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: 
s_xor_b64 s[4:5], s[0:1], s[12:13] ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 @@ -10305,19 +10298,19 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: srem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 ; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX9-NEXT: s_ashr_i32 s8, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s8 +; GFX9-NEXT: s_ashr_i32 s8, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s8 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[8:9] +; GFX9-NEXT: s_addc_u32 s3, s3, s8 +; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s12 ; GFX9-NEXT: s_subb_u32 s1, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll index 52e76dd24a20b4..9f5b6389ab59f5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll @@ -6,7 +6,7 @@ define weak_odr amdgpu_kernel void @test_mul24_knownbits_kernel(ptr addrspace(1) ; GCN-LABEL: test_mul24_knownbits_kernel: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_and_b32_e32 v0, 3, v0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: v_mul_i32_i24_e32 v0, -5, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffffffe0, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll index 1358d91ae102c9..a35fbaadddf9ef 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll @@ -240,7 +240,7 @@ entry: define void @sincos_v2f32_nocontract(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f32_nocontract -; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -298,7 +298,7 @@ entry: define void @sincos_v2f32(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f32 -; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -317,7 +317,7 @@ entry: define void @sincos_v3f32(<3 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr 
addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v3f32 -; CHECK-SAME: (<3 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<3 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <3 x float>, align 16, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <3 x float> @_Z6sincosDv3_fPU3AS5S_(<3 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -340,7 +340,7 @@ entry: define void @sincos_v4f32(<4 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v4f32 -; CHECK-SAME: (<4 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<4 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <4 x float>, align 16, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <4 x float> @_Z6sincosDv4_fPU3AS5S_(<4 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -359,7 +359,7 @@ entry: define void @sincos_v8f32(<8 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v8f32 -; CHECK-SAME: (<8 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<8 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; 
CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <8 x float>, align 32, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <8 x float> @_Z6sincosDv8_fPU3AS5S_(<8 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -378,7 +378,7 @@ entry: define void @sincos_v16f32(<16 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v16f32 -; CHECK-SAME: (<16 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<16 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <16 x float>, align 64, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <16 x float> @_Z6sincosDv16_fPU3AS5S_(<16 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -397,7 +397,7 @@ entry: define void @sincos_f64_nocontract(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f64_nocontract -; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call double @_Z6sincosdPU3AS5d(double [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -417,7 +417,7 @@ entry: define void @sincos_v2f64_nocontract(<2 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f64_nocontract 
-; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x double>, align 16, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @_Z6sincosDv2_dPU3AS5S_(<2 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -436,7 +436,7 @@ entry: define void @sincos_f64(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f64 -; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract double @_Z6sincosdPU3AS5d(double [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -455,7 +455,7 @@ entry: define void @sincos_f64_order1(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f64_order1 -; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract 
double @_Z6sincosdPU3AS5d(double [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -474,7 +474,7 @@ entry: define void @sincos_v2f64(<2 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f64 -; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x double>, align 16, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <2 x double> @_Z6sincosDv2_dPU3AS5S_(<2 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -493,7 +493,7 @@ entry: define void @sincos_v3f64(<3 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v3f64 -; CHECK-SAME: (<3 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<3 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <3 x double>, align 32, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <3 x double> @_Z6sincosDv3_dPU3AS5S_(<3 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -516,7 +516,7 @@ entry: define void @sincos_v4f64(<4 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v4f64 -; CHECK-SAME: (<4 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly 
[[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<4 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <4 x double>, align 32, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <4 x double> @_Z6sincosDv4_dPU3AS5S_(<4 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -535,7 +535,7 @@ entry: define void @sincos_v8f64(<8 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v8f64 -; CHECK-SAME: (<8 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<8 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <8 x double>, align 64, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <8 x double> @_Z6sincosDv8_dPU3AS5S_(<8 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -554,7 +554,7 @@ entry: define void @sincos_v16f64(<16 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v16f64 -; CHECK-SAME: (<16 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<16 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <16 x double>, align 128, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <16 x double> @_Z6sincosDv16_dPU3AS5S_(<16 x double> 
[[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -666,7 +666,7 @@ bb1: define float @select_sin_or_cos_f32(i1 %cond, float %x) { ; CHECK-LABEL: define float @select_sin_or_cos_f32 -; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -685,7 +685,7 @@ declare void @func(ptr addrspace(1)) define void @sincos_f32_value_is_instr(ptr addrspace(1) %value.ptr, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f32_value_is_instr -; CHECK-SAME: (ptr addrspace(1) [[VALUE_PTR:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (ptr addrspace(1) [[VALUE_PTR:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: tail call void @func(ptr addrspace(1) [[VALUE_PTR]]) @@ -838,7 +838,7 @@ entry: define void @sincos_v2f32_flag_intersect1(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f32_flag_intersect1 -; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] 
= alloca <2 x float>, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call nnan contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -859,7 +859,7 @@ declare void @use_stack_ptrs(ptr addrspace(5), ptr addrspace(5)) define void @sincos_f32_alloca_insertpt(float %x) { ; CHECK-LABEL: define void @sincos_f32_alloca_insertpt -; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5) ; CHECK-NEXT: [[ALLOCA1:%.*]] = alloca i32, align 4, addrspace(5) @@ -884,7 +884,7 @@ entry: define float @sincos_f32_unused_result_cos(float %x) { ; CHECK-LABEL: define float @sincos_f32_unused_result_cos -; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { +; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SIN:%.*]] = tail call contract float @_Z3sinf(float [[X]]) ; CHECK-NEXT: ret float [[SIN]] @@ -899,7 +899,7 @@ entry: define float @sincos_f32_unused_result_sin(float %x) { ; CHECK-LABEL: define float @sincos_f32_unused_result_sin -; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COS:%.*]] = tail call contract float @_Z3cosf(float [[X]]) ; CHECK-NEXT: ret float [[COS]] @@ -914,7 +914,7 @@ entry: define void @sincos_f32_repeated_uses(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) { ; CHECK-LABEL: define void @sincos_f32_repeated_uses -; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, 
addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -947,7 +947,7 @@ entry: define void @sin_f32_indirect_call_user(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out, ptr %func.ptr) { ; CHECK-LABEL: define void @sin_f32_indirect_call_user -; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]]) ; CHECK-NEXT: store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4 @@ -965,7 +965,7 @@ entry: define void @cos_f32_indirect_call_user(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out, ptr %func.ptr) { ; CHECK-LABEL: define void @cos_f32_indirect_call_user -; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = tail call contract float @_Z3cosf(float [[X]]) ; CHECK-NEXT: store float [[CALL]], ptr addrspace(1) [[COS_OUT]], align 4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index 9ec8e425a3f55c..bd61558905f634 100644 --- 
a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -37,9 +37,9 @@ ; by 4 bytes. ; HSA-ALLOCA: .amdhsa_private_segment_fixed_size 24 -; HSA-ALLOCA: s_add_i32 s12, s12, s17 -; HSA-ALLOCA-DAG: s_mov_b32 flat_scratch_lo, s13 -; HSA-ALLOCA-DAG: s_lshr_b32 flat_scratch_hi, s12, 8 +; HSA-ALLOCA: s_add_i32 s6, s6, s9 +; HSA-ALLOCA: s_mov_b32 flat_scratch_lo, s7 +; HSA-ALLOCA: s_lshr_b32 flat_scratch_hi, s6, 8 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; encoding: [0x00,0x10,0x70,0xe0 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; encoding: [0x00,0x10,0x70,0xe0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll index 8cda553e61c8ad..cc116dfe807ecd 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll @@ -9,8 +9,8 @@ ; Legacy intrinsics that just read implicit parameters ; FUNC-LABEL: {{^}}ngroups_x: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x0 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x0 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -24,8 +24,8 @@ entry: } ; FUNC-LABEL: {{^}}ngroups_y: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x1 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x4 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -39,8 +39,8 @@ entry: } ; FUNC-LABEL: {{^}}ngroups_z: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x2 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x8 
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -54,8 +54,8 @@ entry: } ; FUNC-LABEL: {{^}}global_size_x: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x3 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0xc +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -69,8 +69,8 @@ entry: } ; FUNC-LABEL: {{^}}global_size_y: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x4 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x10 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -84,8 +84,8 @@ entry: } ; FUNC-LABEL: {{^}}global_size_z: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x5 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x14 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -99,8 +99,8 @@ entry: } ; FUNC-LABEL: {{^}}local_size_x: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x6 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x18 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -114,8 +114,8 @@ entry: } ; FUNC-LABEL: {{^}}local_size_y: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x7 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x1c +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c ; 
GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -129,8 +129,8 @@ entry: } ; FUNC-LABEL: {{^}}local_size_z: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x8 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x20 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll index 91abbfff7f2dee..87084d780410b1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll @@ -23,7 +23,7 @@ ; ELF: Section: .text (0x2) ; ELF: } -; GFX10: NumSGPRsForWavesPerEU: 4 +; GFX10: NumSGPRsForWavesPerEU: 2 ; GFX10: NumVGPRsForWavesPerEU: 1 define amdgpu_kernel void @simple(ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll index 1f8da18cdd3014..897e134ee48d83 100644 --- a/llvm/test/CodeGen/AMDGPU/anyext.ll +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -9,8 +9,8 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; GCN-LABEL: anyext_i1_i32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -22,8 +22,8 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; ; GFX8-LABEL: anyext_i1_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: 
s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -37,17 +37,17 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; ; GFX9-LABEL: anyext_i1_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tmp = icmp eq i32 %cond, 0 @@ -62,8 +62,8 @@ entry: define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { ; GCN-LABEL: s_anyext_i16_i32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s14, 0 ; GCN-NEXT: s_mov_b32 s15, s11 @@ -88,8 +88,8 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: s_anyext_i16_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s7 @@ 
-113,13 +113,13 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: s_anyext_i16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v3, v1, s[0:1] +; GFX9-NEXT: global_load_ushort v3, v1, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index fb764560154d58..624101dc12c5f0 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -24,18 +24,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB0_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -52,18 +52,18 @@ 
define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 @@ -80,18 +80,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -107,10 +107,10 @@ define amdgpu_kernel void 
@add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 @@ -118,10 +118,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -131,25 +130,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10W32-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -160,7 +158,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -168,7 +166,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -176,8 +174,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB0_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -191,24 +189,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: add_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB0_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -223,7 +221,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: add_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; 
GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -231,7 +229,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -239,8 +237,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -254,24 +252,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: add_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: 
s_bcnt1_i32_b32 s3, s3 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -292,23 +290,23 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -321,24 +319,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr 
addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -351,24 +349,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: 
buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -380,16 +378,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 @@ -397,10 +395,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -410,37 +407,37 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: 
add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -448,7 +445,7 @@ define 
amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 @@ -456,8 +453,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -471,41 +468,41 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: 
.LBB1_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -513,7 +510,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 @@ -521,8 +518,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, 
exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -536,32 +533,32 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: add_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] +; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -574,8 +571,8 
@@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -586,36 +583,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add 
v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -627,36 +624,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -667,38 +664,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[0:1], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -708,37 +704,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; 
GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -748,182 +743,174 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 
+; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, 
s[4:7], 0 glc ; GFX11W32-NEXT: .LBB2_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 
.LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: 
v_mov_b32_e32 v1, s0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -937,11 +924,11 @@ entry: define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %vindex) { ; GFX6-LABEL: struct_add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[2:3], 0x11 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x11 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -951,38 +938,38 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX8-LABEL: struct_add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; 
GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB3_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dword s5, s[2:3], 0x44 -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dword s5, s[0:1], 0x44 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB3_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -994,38 +981,38 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX9-LABEL: struct_add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: 
s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB3_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x44 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB3_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1036,41 +1023,40 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX10W64-LABEL: struct_add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: 
s_mov_b64 s[0:1], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB3_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_clause 0x1 -; GFX10W64-NEXT: s_load_dword s5, s[2:3], 0x44 -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dword s5, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mov_b32_e32 v2, s5 ; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB3_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -1080,40 +1066,39 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX10W32-LABEL: struct_add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_clause 0x1 -; GFX10W32-NEXT: s_load_dword s8, s[2:3], 0x44 -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mov_b32_e32 v2, s8 ; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB3_4: ; GFX10W32-NEXT: 
s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -1123,192 +1108,186 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX11W64-LABEL: struct_add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: 
v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB3_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b32 s5, s[2:3], 0x44 -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b32 s5, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mov_b32_e32 v2, s5 -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB3_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: struct_add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: 
v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b32 s8, s[2:3], 0x44 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b32 s8, s[0:1], 0x44 +; 
GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8 -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: v_mov_b32_e32 v2, s8 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB3_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: struct_add_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; 
GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB3_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b32 s5, s[2:3], 0x44 -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: s_load_b32 s5, s[0:1], 0x44 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, s5 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB3_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: struct_add_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b32 s8, s[2:3], 0x44 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: v_mov_b32_e32 v2, s8 +; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB3_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -1322,8 +1301,8 @@ entry: define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: 
; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc @@ -1335,9 +1314,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: add_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1348,9 +1327,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -1360,10 +1339,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: add_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1371,67 +1349,33 @@ define 
amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11W64-LABEL: add_i32_varying_offset: -; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W64-NEXT: s_endpgm -; -; GFX11W32-LABEL: add_i32_varying_offset: -; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W32-NEXT: s_endpgm -; -; GFX12W64-LABEL: add_i32_varying_offset: -; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W64-NEXT: s_endpgm -; -; GFX12W32-LABEL: add_i32_varying_offset: -; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W32-NEXT: s_endpgm +; GFX11-LABEL: add_i32_varying_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: add_i32_varying_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) @@ -1447,18 +1391,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr 
addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB5_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1476,18 +1420,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1505,18 +1449,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: 
v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1533,10 +1477,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 @@ -1544,10 +1488,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: 
s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1558,25 +1501,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1588,7 +1530,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; 
GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1596,7 +1538,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1604,8 +1546,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB5_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1620,24 +1562,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: sub_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB5_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1653,7 +1595,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: sub_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1661,7 +1603,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1669,8 +1611,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, 
s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1685,24 +1627,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: sub_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1724,23 +1666,23 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB6_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB6_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1753,24 +1695,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB6_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1783,24 +1725,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB6_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1812,16 +1754,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: 
s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1829,8 +1771,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) @@ -1842,38 +1784,38 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; 
GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1881,7 +1823,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1889,8 +1831,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB6_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) @@ -1905,42 +1847,42 @@ define amdgpu_kernel void 
@sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB6_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1948,7 +1890,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1956,8 +1898,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -1972,33 +1914,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: sub_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], 
s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -2011,8 +1953,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -2023,36 +1965,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: 
$vgpr1 ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB7_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB7_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 @@ -2064,36 +2006,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; 
GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB7_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2104,38 +2046,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[0:1], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 
1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -2145,37 +2086,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: 
s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -2185,184 +2125,176 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; 
GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB7_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; 
GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB7_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: 
s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB7_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | 
instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -2376,8 +2308,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc @@ -2389,9 +2321,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: sub_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2402,9 +2334,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2414,10 +2346,9 @@ define amdgpu_kernel void 
@sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: sub_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -2425,73 +2356,36 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11W64-LABEL: sub_i32_varying_offset: -; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W64-NEXT: s_endpgm -; -; GFX11W32-LABEL: sub_i32_varying_offset: -; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W32-NEXT: s_endpgm -; -; GFX12W64-LABEL: sub_i32_varying_offset: -; 
GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W64-NEXT: s_endpgm -; -; GFX12W32-LABEL: sub_i32_varying_offset: -; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W32-NEXT: s_endpgm +; GFX11-LABEL: sub_i32_varying_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sub_i32_varying_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: 
buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11: {{.*}} -; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index ca4812f345958a..d3944d3d52d776 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -17,7 +17,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -48,7 +48,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX89-LABEL: add_i32_constant: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b64 s[6:7], exec ; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 @@ -80,7 +80,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -114,7 +114,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -147,7 +147,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1164-LABEL: add_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -184,7 +184,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: add_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -220,7 +220,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: add_i32_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -256,7 +256,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: add_i32_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232-NEXT: s_mov_b32 s5, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, exec_lo ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -297,25 +297,25 @@ entry: define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %additive) { ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7LESS-NEXT: s_load_dword s2, s[2:3], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s8, s6 -; GFX7LESS-NEXT: s_mov_b32 s9, s7 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 -; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB1_2: @@ -324,36 +324,36 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 +; GFX8-NEXT: s_mul_i32 s2, s8, s2 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 @@ -365,29 +365,29 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: 
s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 +; GFX9-NEXT: s_mul_i32 s2, s8, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 @@ -400,20 +400,20 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s2, s10, s2 +; GFX1064-NEXT: s_mul_i32 s2, s8, s2 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_mov_b32 s12, s6 @@ -429,28 +429,28 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v0, s[0:1] +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1] ; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB1_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032-NEXT: s_mov_b32 s8, s6 ; 
GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc @@ -459,38 +459,38 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1] ; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s3, s2, s3 -; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164-NEXT: s_mov_b32 s8, s6 -; GFX1164-NEXT: s_mov_b32 s9, s7 -; 
GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1164-NEXT: s_mul_i32 s2, s8, s2 +; GFX1164-NEXT: s_mov_b32 s14, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv @@ -501,7 +501,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s6, -1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1] +; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] ; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -510,17 +510,17 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-LABEL: add_i32_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s2, s0, s2 @@ -548,26 +548,26 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-LABEL: add_i32_uniform: ; 
GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1264-NEXT: s_mov_b64 s[8:9], exec +; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1264-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1264-NEXT: s_mov_b64 s[2:3], exec ; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1264-NEXT: s_cbranch_execz .LBB1_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_i32 s3, s2, s3 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: v_mov_b32_e32 v1, s3 -; GFX1264-NEXT: s_mov_b32 s8, s6 -; GFX1264-NEXT: s_mov_b32 s9, s7 -; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: s_mul_i32 s2, s8, s2 +; GFX1264-NEXT: s_mov_b32 s14, -1 +; GFX1264-NEXT: v_mov_b32_e32 v1, s2 +; GFX1264-NEXT: s_mov_b32 s12, s6 +; GFX1264-NEXT: s_mov_b32 s13, s7 +; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB1_2: @@ -577,7 +577,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s6, -1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1] +; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s8, v0, s[0:1] ; GFX1264-NEXT: buffer_store_b32 
v0, off, s[4:7], null ; GFX1264-NEXT: s_nop 0 ; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -586,17 +586,17 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-LABEL: add_i32_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s8, exec_lo +; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1232-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1232-NEXT: s_mov_b32 s2, exec_lo ; GFX1232-NEXT: s_mov_b32 s1, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB1_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_i32 s2, s0, s2 @@ -628,7 +628,7 @@ entry: define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { ; GFX7LESS-LABEL: add_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_mov_b32 s10, s6 @@ -646,22 +646,22 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s4 ; GFX8-NEXT: v_readlane_b32 
s7, v0, s4 ; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8-NEXT: v_writelane_b32 v1, s6, m0 ; GFX8-NEXT: s_add_i32 s6, s6, s7 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -691,22 +691,22 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s4 ; GFX9-NEXT: v_readlane_b32 s7, v0, s4 ; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9-NEXT: v_writelane_b32 v1, s6, m0 ; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -736,21 +736,21 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: 
s_mov_b32 s6, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s7, s[0:1] +; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GFX1064-NEXT: s_add_i32 s6, s6, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -782,21 +782,21 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s1 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1032-NEXT: v_writelane_b32 v1, s4, s1 -; GFX1032-NEXT: s_andn2_b32 s0, s0, s6 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1032-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s6 ; GFX1032-NEXT: s_add_i32 s4, s4, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 @@ -827,51 +827,49 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s6, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1164-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] ; GFX1164-NEXT: s_add_i32 s6, s6, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164-NEXT: s_cbranch_execz .LBB2_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mov_b32 s8, s2 ; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB2_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_nop 0 @@ -880,49 +878,47 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1132-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1132-NEXT: 
s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1132-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6 ; GFX1132-NEXT: s_add_i32 s4, s4, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1132-NEXT: s_cbranch_execz .LBB2_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mov_b32 s8, s2 ; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB2_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; 
GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 @@ -931,50 +927,48 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264-LABEL: add_i32_varying: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1264-NEXT: s_mov_b64 s[0:1], exec +; GFX1264-NEXT: s_mov_b64 s[2:3], exec ; GFX1264-NEXT: s_mov_b32 s6, 0 -; GFX1264-NEXT: ; implicit-def: $vgpr0 +; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1264-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1264-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1264-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1264-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] ; GFX1264-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1264-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1264-NEXT: ; implicit-def: $vgpr1 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264-NEXT: ; implicit-def: $vgpr0 ; 
GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264-NEXT: s_cbranch_execz .LBB2_4 ; GFX1264-NEXT: ; %bb.3: -; GFX1264-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB2_4: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1264-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1264-NEXT: s_nop 0 @@ -983,48 +977,46 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232-LABEL: add_i32_varying: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1232-NEXT: s_mov_b32 s0, exec_lo +; GFX1232-NEXT: s_mov_b32 s2, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, 0 -; GFX1232-NEXT: ; implicit-def: $vgpr0 +; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1232-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1232-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1232-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1232-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX1232-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1232-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1232-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1232-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1232-NEXT: s_and_not1_b32 s2, s2, s6 ; GFX1232-NEXT: s_add_co_i32 s4, s4, s5 -; GFX1232-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1232-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1232-NEXT: ; implicit-def: $vgpr1 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232-NEXT: ; implicit-def: $vgpr0 ; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1232-NEXT: s_cbranch_execz .LBB2_4 ; GFX1232-NEXT: ; %bb.3: -; GFX1232-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s10, -1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB2_4: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1232-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; 
GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1232-NEXT: s_nop 0 @@ -1041,7 +1033,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1079,7 +1071,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX89-LABEL: add_i64_constant: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b64 s[6:7], exec ; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1116,7 +1108,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1152,7 +1144,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -1187,7 +1179,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1226,7 +1218,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -1263,7 +1255,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: add_i64_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1302,7 +1294,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: add_i64_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232-NEXT: s_mov_b32 s4, exec_lo ; GFX1232-NEXT: s_mov_b32 s5, 0 ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 @@ -1346,8 +1338,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1390,8 +1382,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; 
; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 @@ -1430,24 +1422,24 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s1, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s0, s6 +; GFX9-NEXT: s_mul_i32 s7, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s0, s6 +; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -1456,38 +1448,38 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_or_b64 exec, 
exec, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v2, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s1, v2, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2] ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i64_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s9, s1, s8 -; GFX1064-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1064-NEXT: s_mul_i32 s8, s0, s8 +; GFX1064-NEXT: s_mul_i32 s9, s3, s8 +; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 +; GFX1064-NEXT: s_mul_i32 s8, s2, s8 ; GFX1064-NEXT: s_add_i32 s10, s10, s9 ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064-NEXT: v_mov_b32_e32 v1, s10 @@ -1500,37 +1492,37 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: 
v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v2, s[2:3] -; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s1, v2, v[1:2] +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, s[0:1] +; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2] ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i64_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s8, s1, s3 -; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1032-NEXT: s_mul_i32 s3, s0, s3 +; GFX1032-NEXT: s_mul_i32 s8, s3, s1 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 ; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_mov_b32 s8, s6 @@ -1541,22 +1533,22 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr 
addrspace( ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v2, s[2:3] -; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s1, v2, v[1:2] +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v2, s[0:1] +; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s3, v2, v[1:2] ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i64_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 @@ -1602,17 +1594,17 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-LABEL: add_i64_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; 
GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s8 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s8, s1, s3 @@ -1648,8 +1640,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-LABEL: add_i64_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 @@ -1690,18 +1682,18 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-LABEL: add_i64_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s9, exec_lo +; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1232-NEXT: s_mov_b32 s2, exec_lo ; GFX1232-NEXT: s_mov_b32 s3, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 ; GFX1232-NEXT: s_mov_b32 s8, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB4_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] @@ -1735,7 +1727,7 @@ entry: define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { ; GFX7LESS-LABEL: add_i64_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -1754,7 +1746,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX89-LABEL: add_i64_varying: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -1773,7 +1765,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: add_i64_varying: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 @@ -1791,93 +1783,48 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; -; GFX1164-LABEL: add_i64_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s6, -1 -; GFX1164-NEXT: s_mov_b32 s11, s7 -; GFX1164-NEXT: s_mov_b32 s10, s6 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s8, s2 -; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: s_mov_b32 s4, s0 -; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: s_mov_b32 s5, s1 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: add_i64_varying: -; GFX1132: ; 
%bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 -; GFX1132-NEXT: s_mov_b32 s11, s7 -; GFX1132-NEXT: s_mov_b32 s10, s6 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s8, s2 -; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: s_mov_b32 s4, s0 -; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: s_mov_b32 s5, s1 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm -; -; GFX1264-LABEL: add_i64_varying: -; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_mov_b32 s11, s7 -; GFX1264-NEXT: s_mov_b32 s10, s6 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s8, s2 -; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: s_mov_b32 s4, s0 -; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: s_mov_b32 s5, s1 -; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1264-NEXT: s_endpgm -; -; GFX1232-LABEL: add_i64_varying: -; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s6, -1 -; GFX1232-NEXT: s_mov_b32 s11, s7 -; GFX1232-NEXT: s_mov_b32 s10, s6 -; GFX1232-NEXT: s_wait_kmcnt 0x0 
-; GFX1232-NEXT: s_mov_b32 s8, s2 -; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: s_mov_b32 s4, s0 -; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: s_mov_b32 s5, s1 -; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1232-NEXT: s_endpgm +; GFX11-LABEL: add_i64_varying: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: add_i64_varying: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: 
%lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -1890,7 +1837,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -1922,7 +1869,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 @@ -1955,7 +1902,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 @@ -1988,7 +1935,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1064-LABEL: sub_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -2023,7 +1970,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1032-LABEL: sub_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: 
s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -2057,7 +2004,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1164-LABEL: sub_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -2095,7 +2042,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: sub_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -2132,7 +2079,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: sub_i32_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -2169,7 +2116,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: sub_i32_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232-NEXT: s_mov_b32 s5, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, exec_lo ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -2211,25 +2158,25 @@ entry: define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %subitive) { ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7LESS-NEXT: s_load_dword 
s2, s[2:3], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s8, s6 -; GFX7LESS-NEXT: s_mov_b32 s9, s7 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 -; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB7_2: @@ -2238,36 +2185,36 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB7_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 +; GFX8-NEXT: s_mul_i32 s2, s8, s2 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 @@ -2279,29 +2226,29 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 
s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 +; GFX9-NEXT: s_mul_i32 s2, s8, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 @@ -2314,20 +2261,20 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s2, s10, s2 +; GFX1064-NEXT: s_mul_i32 s2, s8, s2 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_mov_b32 s12, s6 
@@ -2340,7 +2287,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s10, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -2351,21 +2298,21 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-LABEL: sub_i32_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032-NEXT: s_mov_b32 s8, s6 ; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc @@ -2374,9 +2321,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 @@ -2387,33 +2334,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-LABEL: sub_i32_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB7_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s3, s2, s3 -; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164-NEXT: s_mov_b32 s8, s6 -; GFX1164-NEXT: s_mov_b32 s9, s7 -; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1164-NEXT: s_mul_i32 s2, s8, s2 +; GFX1164-NEXT: s_mov_b32 s14, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB7_2: ; GFX1164-NEXT: 
s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s6, -1 @@ -2427,17 +2374,17 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-LABEL: sub_i32_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s2, s0, s2 @@ -2466,32 +2413,32 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-LABEL: sub_i32_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1264-NEXT: s_mov_b64 s[8:9], exec +; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1264-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1264-NEXT: s_mov_b64 s[2:3], exec ; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: 
v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1264-NEXT: s_cbranch_execz .LBB7_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_i32 s3, s2, s3 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: v_mov_b32_e32 v1, s3 -; GFX1264-NEXT: s_mov_b32 s8, s6 -; GFX1264-NEXT: s_mov_b32 s9, s7 -; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: s_mul_i32 s2, s8, s2 +; GFX1264-NEXT: s_mov_b32 s14, -1 +; GFX1264-NEXT: v_mov_b32_e32 v1, s2 +; GFX1264-NEXT: s_mov_b32 s12, s6 +; GFX1264-NEXT: s_mov_b32 s13, s7 +; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB7_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s6, -1 @@ -2505,17 +2452,17 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-LABEL: sub_i32_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s8, exec_lo +; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1232-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1232-NEXT: s_mov_b32 s2, exec_lo ; GFX1232-NEXT: s_mov_b32 s1, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: 
v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB7_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_i32 s2, s0, s2 @@ -2548,7 +2495,7 @@ entry: define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { ; GFX7LESS-LABEL: sub_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_mov_b32 s10, s6 @@ -2566,22 +2513,22 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB8_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s4 ; GFX8-NEXT: v_readlane_b32 s7, v0, s4 ; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8-NEXT: v_writelane_b32 v1, s6, m0 ; GFX8-NEXT: s_add_i32 s6, s6, s7 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -2611,22 +2558,22 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 
s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB8_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s4 ; GFX9-NEXT: v_readlane_b32 s7, v0, s4 ; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9-NEXT: v_writelane_b32 v1, s6, m0 ; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -2656,21 +2603,21 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s6, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s7, s[0:1] +; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GFX1064-NEXT: s_add_i32 s6, s6, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -2702,21 +2649,21 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s1 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1032-NEXT: v_writelane_b32 v1, s4, s1 -; GFX1032-NEXT: s_andn2_b32 s0, s0, s6 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1032-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s6 ; GFX1032-NEXT: s_add_i32 s4, s4, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 @@ -2747,51 +2694,49 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s6, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; 
GFX1164-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1164-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] ; GFX1164-NEXT: s_add_i32 s6, s6, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0 ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164-NEXT: s_cbranch_execz .LBB8_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mov_b32 s8, s2 ; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB8_4: ; GFX1164-NEXT: s_or_b64 
exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_nop 0 @@ -2800,49 +2745,47 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1132-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1132-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6 ; GFX1132-NEXT: s_add_i32 s4, s4, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1132-NEXT: s_cbranch_execz .LBB8_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mov_b32 s8, s2 ; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB8_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 @@ -2851,50 +2794,48 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264-LABEL: sub_i32_varying: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1264-NEXT: s_mov_b64 s[0:1], exec +; GFX1264-NEXT: s_mov_b64 s[2:3], exec ; GFX1264-NEXT: s_mov_b32 s6, 0 -; GFX1264-NEXT: ; implicit-def: $vgpr0 +; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1264-NEXT: v_readlane_b32 s8, v1, s7 +; 
GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1264-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1264-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1264-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] ; GFX1264-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1264-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1264-NEXT: ; implicit-def: $vgpr1 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264-NEXT: ; implicit-def: $vgpr0 ; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264-NEXT: s_cbranch_execz .LBB8_4 ; GFX1264-NEXT: ; %bb.3: -; GFX1264-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB8_4: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1264-NEXT: s_nop 0 @@ -2903,48 +2844,46 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232-LABEL: sub_i32_varying: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1232-NEXT: s_mov_b32 s0, exec_lo +; GFX1232-NEXT: s_mov_b32 s2, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, 0 -; GFX1232-NEXT: ; implicit-def: $vgpr0 +; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1232-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1232-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1232-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1232-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1232-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1232-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1232-NEXT: s_and_not1_b32 s2, s2, s6 ; GFX1232-NEXT: s_add_co_i32 s4, s4, s5 -; GFX1232-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1232-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1232-NEXT: ; implicit-def: $vgpr1 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232-NEXT: ; implicit-def: $vgpr0 ; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1232-NEXT: s_cbranch_execz .LBB8_4 ; GFX1232-NEXT: ; %bb.3: -; GFX1232-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s10, -1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB8_4: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1232-NEXT: s_nop 0 @@ -2961,7 +2900,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -2999,7 +2938,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -3037,7 +2976,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -3075,7 +3014,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -3114,7 +3053,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -3152,7 +3091,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -3194,7 +3133,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -3234,7 +3173,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: sub_i64_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -3276,7 +3215,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: sub_i64_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232-NEXT: s_mov_b32 s4, exec_lo ; GFX1232-NEXT: s_mov_b32 s5, 0 ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 @@ -3323,8 +3262,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -3367,8 +3306,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 @@ -3408,24 +3347,24 @@ 
define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s1, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s0, s6 +; GFX9-NEXT: s_mul_i32 s7, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s0, s6 +; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -3434,12 +3373,12 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB10_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s0, v2, 0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s1, v2, v[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 @@ -3452,22 +3391,22 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( 
; GFX1064-LABEL: sub_i64_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s9, s1, s8 -; GFX1064-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1064-NEXT: s_mul_i32 s8, s0, s8 +; GFX1064-NEXT: s_mul_i32 s9, s3, s8 +; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 +; GFX1064-NEXT: s_mul_i32 s8, s2, s8 ; GFX1064-NEXT: s_add_i32 s10, s10, s9 ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064-NEXT: v_mov_b32_e32 v1, s10 @@ -3480,12 +3419,12 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB10_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s0, v2, 0 +; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s1, v2, v[4:5] +; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3 @@ -3497,23 +3436,23 @@ define amdgpu_kernel void @sub_i64_uniform(ptr 
addrspace(1) %out, ptr addrspace( ; GFX1032-LABEL: sub_i64_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s8, s1, s3 -; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1032-NEXT: s_mul_i32 s3, s0, s3 +; GFX1032-NEXT: s_mul_i32 s8, s3, s1 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 ; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_mov_b32 s8, s6 @@ -3524,14 +3463,14 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB10_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s0, v2, 0 +; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s2, v2, 0 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s1, v2, v[4:5] +; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s3, v2, v[4:5] ; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 
-; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo @@ -3541,8 +3480,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-LABEL: sub_i64_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 @@ -3590,17 +3529,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-LABEL: sub_i64_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB10_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s8 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s8, s1, s3 @@ -3638,8 +3577,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-LABEL: sub_i64_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 @@ -3684,18 +3623,18 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-LABEL: sub_i64_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s9, exec_lo +; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1232-NEXT: s_mov_b32 s2, exec_lo ; GFX1232-NEXT: s_mov_b32 s3, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 ; GFX1232-NEXT: s_mov_b32 s8, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB10_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] @@ -3733,7 +3672,7 @@ entry: define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { ; GFX7LESS-LABEL: sub_i64_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -3752,7 +3691,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX89-LABEL: sub_i64_varying: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; 
GFX89-NEXT: s_mov_b32 s10, s6 @@ -3771,7 +3710,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: sub_i64_varying: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 @@ -3789,93 +3728,48 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; -; GFX1164-LABEL: sub_i64_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s6, -1 -; GFX1164-NEXT: s_mov_b32 s11, s7 -; GFX1164-NEXT: s_mov_b32 s10, s6 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s8, s2 -; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: s_mov_b32 s4, s0 -; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: s_mov_b32 s5, s1 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: sub_i64_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 -; GFX1132-NEXT: s_mov_b32 s11, s7 -; GFX1132-NEXT: s_mov_b32 s10, s6 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s8, s2 -; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: s_mov_b32 s4, s0 -; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc -; GFX1132-NEXT: 
s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: s_mov_b32 s5, s1 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm -; -; GFX1264-LABEL: sub_i64_varying: -; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_mov_b32 s11, s7 -; GFX1264-NEXT: s_mov_b32 s10, s6 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s8, s2 -; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: s_mov_b32 s4, s0 -; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: s_mov_b32 s5, s1 -; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1264-NEXT: s_endpgm -; -; GFX1232-LABEL: sub_i64_varying: -; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s6, -1 -; GFX1232-NEXT: s_mov_b32 s11, s7 -; GFX1232-NEXT: s_mov_b32 s10, s6 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s8, s2 -; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: s_mov_b32 s4, s0 -; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: s_mov_b32 s5, s1 -; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1232-NEXT: s_endpgm +; GFX11-LABEL: sub_i64_varying: +; 
GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sub_i64_varying: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -3883,6 +3777,3 @@ entry: store i64 %old, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX11: {{.*}} -; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 3784af443c7f1f..b0b40aa952a9fb 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -24,7 +24,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -35,8 +35,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB0_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -52,7 +52,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -63,8 +63,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -80,7 +80,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -90,8 +90,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -107,7 +107,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -119,9 +119,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; 
GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -132,25 +131,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: s_mul_i32 s3, s3, 5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s3 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -162,7 +160,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: add_i32_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -179,8 +177,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB0_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -194,24 +192,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: add_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_mul_i32 s1, s1, 5 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 +; GFX1132-NEXT: s_mul_i32 s3, s3, 5 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB0_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -234,12 +232,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 
s[4:5], exec -; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -251,8 +249,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB1_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -264,13 +262,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -282,8 +280,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; 
GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 @@ -295,13 +293,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -312,8 +310,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -325,13 +323,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB1_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -344,9 +342,8 @@ define 
amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -357,40 +354,39 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1032-LABEL: add_i32_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB1_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s4, s0, s4 +; GFX1032-NEXT: s_mul_i32 s4, s2, s4 ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b32 s6, s[2:3], 0x2c +; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -408,8 +404,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB1_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -423,9 +419,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1132-LABEL: add_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -434,22 +430,22 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s4, s0, s4 +; 
GFX1132-NEXT: s_mul_i32 s4, s2, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB1_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] -; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 +; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -464,7 +460,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: add_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -477,27 +473,27 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: 
v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -506,8 +502,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -518,27 +514,27 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: 
s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -546,8 +542,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -558,26 +554,26 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064-NEXT: s_add_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064-NEXT: ; %bb.2: ; 
%ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execz .LBB2_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -587,9 +583,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB2_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 @@ -600,37 +595,36 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_add_i32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_add_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032-NEXT: ; %bb.2: ; 
%ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032-NEXT: s_cbranch_execz .LBB2_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB2_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 @@ -641,45 +635,43 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], 
s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164-NEXT: s_add_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB2_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB2_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -689,42 
+681,41 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_add_i32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_add_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB2_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; 
GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB2_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -863,17 +854,16 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1164-LABEL: add_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s2, 0 ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: s_add_i32 s2, s2, s6 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 @@ -896,17 +886,16 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1132-LABEL: add_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132-NEXT: 
s_mov_b32 s1, exec_lo ; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: s_add_i32 s0, s0, s3 ; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 @@ -940,7 +929,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -951,8 +940,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB4_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -973,7 +962,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: 
s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -984,10 +973,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] @@ -1005,7 +994,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1015,10 +1004,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] @@ -1036,7 +1025,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: 
s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1048,9 +1037,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] @@ -1062,25 +1050,24 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: s_mul_i32 s3, s3, 5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 
s2, v0 ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] @@ -1093,7 +1080,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -1110,8 +1097,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB4_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1126,25 +1113,25 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_mul_i32 s1, s1, 5 +; GFX1132-NEXT: s_mul_i32 s3, s3, 5 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s1 +; GFX1132-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; 
GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB4_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1168,7 +1155,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1209,7 +1196,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1247,7 +1234,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1285,7 +1272,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1064-LABEL: add_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 
0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1321,7 +1308,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1032-LABEL: add_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -1356,7 +1343,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1164-LABEL: add_i64_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1397,7 +1384,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1132-LABEL: add_i64_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -1445,7 +1432,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: add_i64_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1460,7 +1447,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ 
-1472,7 +1459,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9-LABEL: add_i64_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1484,7 +1471,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX10-LABEL: add_i64_varying: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1494,36 +1481,20 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; -; GFX1164-LABEL: add_i64_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: add_i64_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: 
buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX11-LABEL: add_i64_varying: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -1542,7 +1513,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1553,8 +1524,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB7_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -1571,7 +1542,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; 
GFX8-NEXT: s_cbranch_execz .LBB7_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1582,8 +1553,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -1600,7 +1571,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1610,8 +1581,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -1628,7 +1599,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1640,9 +1611,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; 
GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB7_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -1654,25 +1624,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: sub_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: s_mul_i32 s3, s3, 5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s3 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -1685,7 +1654,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: sub_i32_constant: ; GFX1164: ; %bb.0: ; %entry ; 
GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1702,8 +1671,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB7_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -1718,24 +1687,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: sub_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_mul_i32 s1, s1, 5 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 +; GFX1132-NEXT: s_mul_i32 s3, s3, 5 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB7_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -1759,12 +1728,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1776,8 +1745,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB8_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -1789,13 +1758,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: 
s_cbranch_execz .LBB8_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1807,8 +1776,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB8_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 @@ -1820,13 +1789,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1837,8 +1806,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB8_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -1850,13 +1819,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX1064-NEXT: 
s_mov_b64 s[4:5], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1869,8 +1838,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -1882,40 +1851,40 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1032-LABEL: sub_i32_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s4, s0, s4 +; GFX1032-NEXT: s_mul_i32 s4, s2, s4 ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b32 s6, s[2:3], 0x2c +; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1933,8 +1902,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB8_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 @@ -1949,9 +1918,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1132-LABEL: sub_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: 
s_mov_b32 s3, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1960,23 +1929,23 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s4, s0, s4 +; GFX1132-NEXT: s_mul_i32 s4, s2, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB8_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -1991,7 +1960,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: sub_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -2004,27 +1973,27 @@ define amdgpu_kernel 
void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB9_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB9_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -2033,8 +2002,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB9_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -2045,27 +2014,27 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 
s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB9_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB9_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2073,8 +2042,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB9_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -2085,26 +2054,26 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] +; 
GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064-NEXT: s_add_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execz .LBB9_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -2114,9 +2083,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB9_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 @@ -2127,37 +2095,36 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 ; 
GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_add_i32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_add_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032-NEXT: s_cbranch_execz .LBB9_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB9_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 @@ -2168,45 +2135,43 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164-NEXT: s_add_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB9_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 +; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB9_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: 
v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -2216,42 +2181,41 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_add_i32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_add_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; 
GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB9_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB9_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -2390,17 +2354,16 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1164-LABEL: sub_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s2, 0 ; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: s_add_i32 s2, s2, s6 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 @@ -2423,17 +2386,16 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1132-LABEL: sub_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132-NEXT: s_mov_b32 s1, exec_lo ; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: s_add_i32 s0, s0, s3 ; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 @@ -2467,7 +2429,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -2478,8 +2440,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB11_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, 
s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -2500,7 +2462,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB11_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -2511,8 +2473,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB11_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_readfirstlane_b32 s5, v0 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2533,7 +2495,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -2543,8 +2505,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB11_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ 
-2565,7 +2527,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -2577,9 +2539,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB11_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -2594,25 +2555,24 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: s_mul_i32 s3, s3, 5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB11_2: ; GFX1032-NEXT: 
s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -2628,7 +2588,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -2645,8 +2605,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB11_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -2664,25 +2624,25 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 +; 
GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_mul_i32 s1, s1, 5 +; GFX1132-NEXT: s_mul_i32 s3, s3, 5 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s1 +; GFX1132-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB11_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -2709,7 +2669,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -2750,7 +2710,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -2789,7 +2749,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -2829,7 +2789,7 @@ define 
amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1064-LABEL: sub_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -2868,7 +2828,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1032-LABEL: sub_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -2906,7 +2866,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1164-LABEL: sub_i64_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -2949,7 +2909,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1132-LABEL: sub_i64_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -2999,7 +2959,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: sub_i64_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3014,7 +2974,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr 
addrspace(1) %out) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3026,7 +2986,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9-LABEL: sub_i64_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3038,7 +2998,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX10-LABEL: sub_i64_varying: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3048,36 +3008,20 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; -; GFX1164-LABEL: sub_i64_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: sub_i64_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_and_b32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX11-LABEL: sub_i64_varying: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -3091,7 +3035,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: and_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3104,27 +3048,27 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: and_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB14_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 
s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_and_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB14_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -3133,8 +3077,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB14_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -3145,27 +3089,27 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: and_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB14_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_and_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: 
s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3173,8 +3117,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB14_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -3185,26 +3129,26 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: and_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s4, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064-NEXT: s_and_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execz .LBB14_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -3214,9 +3158,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB14_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1 @@ -3227,37 +3170,36 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: and_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, -1 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_and_b32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_and_b32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; 
GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032-NEXT: s_cbranch_execz .LBB14_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB14_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1 @@ -3268,45 +3210,43 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: and_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s4, -1 -; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; 
GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164-NEXT: s_and_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB14_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_and_rtn_b32 v1, v1, v2 +; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB14_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX1164-NEXT: v_and_b32_e32 v0, s2, v1 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3316,42 +3256,41 @@ 
define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: and_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, -1 -; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_and_b32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_and_b32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB14_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; 
GFX1132-NEXT: ds_and_rtn_b32 v1, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB14_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX1132-NEXT: v_and_b32_e32 v0, s2, v1 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3370,7 +3309,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: or_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3383,27 +3322,27 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB15_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_or_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 
0 ; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB15_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -3412,8 +3351,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB15_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -3424,27 +3363,27 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB15_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_or_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB15_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3452,8 +3391,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB15_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -3464,26 +3403,26 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: or_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064-NEXT: s_or_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: 
s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execz .LBB15_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -3493,9 +3432,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB15_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1 @@ -3506,37 +3444,36 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: or_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_or_b32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_or_b32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: 
s_xor_b32 s3, exec_lo, s3 ; GFX1032-NEXT: s_cbranch_execz .LBB15_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB15_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1 @@ -3547,45 +3484,43 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: or_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164-NEXT: s_or_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 
.LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB15_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_or_rtn_b32 v1, v1, v2 +; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB15_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX1164-NEXT: v_or_b32_e32 v0, s2, v1 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3595,42 +3530,41 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: or_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 +; 
GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_or_b32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_or_b32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB15_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_or_rtn_b32 v1, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB15_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; 
GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX1132-NEXT: v_or_b32_e32 v0, s2, v1 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3649,7 +3583,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: xor_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3662,27 +3596,27 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB16_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_xor_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; 
GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB16_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -3691,8 +3625,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB16_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -3703,27 +3637,27 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB16_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_xor_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; 
GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3731,8 +3665,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB16_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -3743,26 +3677,26 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: xor_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064-NEXT: s_xor_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execz .LBB16_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -3772,9 +3706,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: 
buffer_gl0_inv ; GFX1064-NEXT: .LBB16_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1 @@ -3785,37 +3718,36 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: xor_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_xor_b32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_xor_b32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032-NEXT: s_cbranch_execz .LBB16_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB16_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1 @@ -3826,45 +3758,43 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: xor_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164-NEXT: s_xor_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, 
v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB16_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_xor_rtn_b32 v1, v1, v2 +; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB16_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v1 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3874,42 +3804,41 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: xor_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_xor_b32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_xor_b32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB16_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_xor_rtn_b32 v1, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB16_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 
0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v1 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3928,7 +3857,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: max_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3941,27 +3870,27 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: max_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_brev_b32 s4, 1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB17_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_max_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB17_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -3970,8 +3899,8 @@ define 
amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB17_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -3982,27 +3911,27 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: max_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_brev_b32 s4, 1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB17_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_max_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB17_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -4010,8 +3939,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB17_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -4022,26 +3951,26 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_brev_b32 s4, 1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064-NEXT: s_max_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execz .LBB17_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -4051,9 +3980,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB17_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, 
s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1 @@ -4064,37 +3992,36 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_brev_b32 s0, 1 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_brev_b32 s2, 1 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_max_i32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_max_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032-NEXT: s_cbranch_execz .LBB17_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB17_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: 
s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1 @@ -4105,45 +4032,43 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: max_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_brev_b32 s4, 1 -; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164-NEXT: s_max_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB17_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_max_rtn_i32 v1, v1, v2 +; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB17_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_e32 v0, s2, v0 +; GFX1164-NEXT: v_max_i32_e32 v0, s2, v1 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4153,42 +4078,41 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: max_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_brev_b32 s0, 1 -; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_brev_b32 s2, 1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, 
v0, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_max_i32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_max_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB17_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_max_rtn_i32 v1, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB17_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_e32 v0, s2, v0 +; GFX1132-NEXT: v_max_i32_e32 v0, s2, v1 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 
0 @@ -4211,7 +4135,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -4221,8 +4145,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB18_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -4245,7 +4169,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB18_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -4255,10 +4179,10 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB18_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc @@ -4279,7 +4203,7 @@ define amdgpu_kernel 
void @max_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -4288,10 +4212,10 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB18_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc @@ -4312,7 +4236,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB18_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -4323,9 +4247,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB18_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc @@ -4344,7 +4267,7 @@ define amdgpu_kernel void 
@max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB18_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -4355,9 +4278,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB18_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo @@ -4378,7 +4300,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB18_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -4388,8 +4310,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB18_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc @@ -4412,7 +4334,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; 
GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB18_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -4421,8 +4343,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB18_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo @@ -4449,7 +4371,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: min_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -4462,27 +4384,27 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: min_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB19_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_min_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], 
s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB19_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -4491,8 +4413,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB19_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -4503,27 +4425,27 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: min_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_brev_b32 s4, -2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB19_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_min_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 
v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB19_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -4531,8 +4453,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB19_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -4543,26 +4465,26 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_brev_b32 s4, -2 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064-NEXT: s_min_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; 
GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execz .LBB19_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -4572,9 +4494,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB19_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_min_i32_e32 v0, s2, v1 @@ -4585,37 +4506,36 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_brev_b32 s0, -2 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_brev_b32 s2, -2 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_min_i32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_min_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032-NEXT: s_cbranch_execz .LBB19_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB19_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1 @@ -4626,45 +4546,43 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: min_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_brev_b32 s4, -2 -; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164-NEXT: s_min_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; 
GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB19_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_min_rtn_i32 v1, v1, v2 +; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB19_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_e32 v0, s2, v0 +; GFX1164-NEXT: v_min_i32_e32 v0, s2, v1 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4674,42 +4592,41 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: min_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; 
GFX1132-NEXT: s_brev_b32 s0, -2 -; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_brev_b32 s2, -2 +; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_min_i32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_min_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB19_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_min_rtn_i32 v1, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv 
; GFX1132-NEXT: .LBB19_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_e32 v0, s2, v0 +; GFX1132-NEXT: v_min_i32_e32 v0, s2, v1 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4732,7 +4649,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -4742,8 +4659,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB20_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -4766,7 +4683,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -4776,10 +4693,10 @@ define 
amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB20_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc @@ -4800,7 +4717,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -4809,10 +4726,10 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB20_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc @@ -4833,7 +4750,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB20_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -4844,9 +4761,8 
@@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB20_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc @@ -4865,7 +4781,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB20_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -4876,9 +4792,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB20_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo @@ -4899,7 +4814,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB20_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -4909,8 +4824,8 @@ define 
amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB20_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc @@ -4933,7 +4848,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB20_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -4942,8 +4857,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB20_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo @@ -4970,7 +4885,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: umax_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -4983,27 +4898,27 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], 
exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB21_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_max_u32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB21_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -5012,8 +4927,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB21_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -5024,27 +4939,27 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB21_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_max_u32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB21_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -5052,8 +4967,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB21_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -5064,26 +4979,26 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: umax_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; 
GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064-NEXT: s_max_u32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execz .LBB21_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -5093,9 +5008,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB21_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1 @@ -5106,37 +5020,36 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: umax_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; 
GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_max_u32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_max_u32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032-NEXT: s_cbranch_execz .LBB21_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB21_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1 @@ -5147,45 +5060,43 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: umax_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: 
v_readlane_b32 s8, v1, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164-NEXT: s_max_u32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB21_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_max_rtn_u32 v1, v1, v2 +; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB21_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: 
v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX1164-NEXT: v_max_u32_e32 v0, s2, v1 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -5195,42 +5106,41 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: umax_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_max_u32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_max_u32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; 
GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB21_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_max_rtn_u32 v1, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB21_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX1132-NEXT: v_max_u32_e32 v0, s2, v1 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -5253,7 +5163,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -5263,8 +5173,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB22_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -5286,7 +5196,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -5296,8 +5206,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB22_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -5319,7 +5229,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -5328,8 +5238,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB22_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -5351,7 +5261,7 @@ define amdgpu_kernel void 
@umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -5362,9 +5272,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -5383,7 +5292,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -5394,9 +5303,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -5417,7 +5325,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; 
GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB22_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -5427,8 +5335,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB22_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -5451,7 +5359,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB22_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -5460,8 +5368,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB22_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 @@ -5488,7 +5396,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: umin_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -5501,27 +5409,27 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX8-LABEL: umin_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB23_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_min_u32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB23_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -5530,8 +5438,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB23_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -5542,27 +5450,27 @@ define amdgpu_kernel void 
@umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX9-LABEL: umin_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB23_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_min_u32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB23_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -5570,8 +5478,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB23_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -5582,26 +5490,26 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: umin_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; 
GFX1064-NEXT: s_mov_b32 s4, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064-NEXT: s_min_u32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execz .LBB23_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -5611,9 +5519,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB23_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1 @@ -5624,37 +5531,36 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: umin_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, -1 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, -1 
; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_min_u32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_min_u32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032-NEXT: s_cbranch_execz .LBB23_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB23_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_min_u32_e32 v0, s2, v1 @@ -5665,45 +5571,43 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: umin_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; 
GFX1164-NEXT: s_mov_b32 s4, -1 -; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164-NEXT: s_min_u32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB23_4 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_min_rtn_u32 v1, v1, v2 +; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v2 ; 
GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB23_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX1164-NEXT: v_min_u32_e32 v0, s2, v1 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -5713,42 +5617,41 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: umin_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, -1 -; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_min_u32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_min_u32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd 
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB23_4 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_min_rtn_u32 v1, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB23_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX1132-NEXT: v_min_u32_e32 v0, s2, v1 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -5771,7 +5674,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -5781,8 +5684,8 @@ define amdgpu_kernel void 
@umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB24_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -5804,7 +5707,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB24_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -5814,8 +5717,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB24_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5837,7 +5740,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -5846,8 +5749,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB24_2: -; 
GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5869,7 +5772,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB24_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -5880,9 +5783,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB24_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5901,7 +5803,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB24_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -5912,9 +5814,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB24_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo @@ -5935,7 +5836,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB24_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -5945,8 +5846,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB24_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5969,7 +5870,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB24_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -5978,8 +5879,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB24_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo @@ -6000,5 +5901,3 @@ entry: store i64 %old, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 995d3fee672913..ca94d68f019177 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -23,18 +23,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB0_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -51,18 +51,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; 
GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 @@ -79,18 +79,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -106,10 +106,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 ; 
GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 @@ -117,10 +117,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -130,25 +129,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -159,7 +157,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -167,7 +165,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -175,8 +173,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB0_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -190,24 +188,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: 
add_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB0_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -222,7 +220,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: add_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -230,7 +228,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: 
s_cbranch_execz .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -238,8 +236,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -253,24 +251,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: add_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: 
buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -291,23 +289,23 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -320,24 +318,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -350,24 +348,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) @@ -379,16 +377,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 @@ -396,10 +394,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -409,37 +406,37 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; 
GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -447,7 +444,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 
s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 @@ -455,8 +452,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -470,41 +467,41 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB1_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -512,7 +509,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 @@ -520,8 +517,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -535,32 +532,32 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: 
add_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] +; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -573,8 +570,8 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -585,36 +582,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, 
v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -626,36 +623,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -666,38 +663,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; 
GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[0:1], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 
@@ -707,37 +703,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) 
-; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -747,182 +742,174 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB2_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: 
v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 
-; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; 
GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null 
th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -936,8 +923,8 @@ entry: define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc @@ -949,9 +936,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: add_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -962,9 +949,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_offset: ; GFX9: ; %bb.0: ; 
%entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -974,10 +961,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: add_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -985,67 +971,33 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11W64-LABEL: add_i32_varying_offset: -; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W64-NEXT: s_endpgm -; -; GFX11W32-LABEL: add_i32_varying_offset: -; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: 
v_dual_and_b32 v0, 0x3ff, v0 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W32-NEXT: s_endpgm +; GFX11-LABEL: add_i32_varying_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX12W64-LABEL: add_i32_varying_offset: -; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W64-NEXT: s_endpgm -; -; GFX12W32-LABEL: add_i32_varying_offset: -; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W32-NEXT: 
s_wait_loadcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W32-NEXT: s_endpgm +; GFX12-LABEL: add_i32_varying_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) @@ -1061,18 +1013,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB4_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB4_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1090,18 +1042,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1119,18 +1071,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1147,10 +1099,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: 
v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 @@ -1158,10 +1110,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB4_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1172,25 +1123,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB4_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1202,7 +1152,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1210,7 +1160,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1218,8 +1168,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB4_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: 
s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1234,24 +1184,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: sub_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB4_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1267,7 +1217,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: sub_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; 
GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1275,7 +1225,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1283,8 +1233,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB4_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1299,24 +1249,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: sub_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB4_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1338,23 +1288,23 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB5_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1367,24 +1317,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; 
GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1397,24 +1347,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB5_2: -; 
GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1426,16 +1376,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1443,8 +1393,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) @@ -1456,38 +1406,38 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: 
s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1495,7 +1445,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) 
%out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1503,8 +1453,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB5_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) @@ -1519,42 +1469,42 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB5_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, 
s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1562,7 +1512,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1570,8 +1520,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -1586,33 +1536,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: sub_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W32-NEXT: 
global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -1625,8 +1575,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -1637,36 +1587,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB6_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB6_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], 
s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB6_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 @@ -1678,36 +1628,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB6_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB6_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB6_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1718,38 +1668,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[0:1], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; 
GFX10W64-NEXT: .LBB6_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1759,37 +1708,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB6_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1799,184 +1747,176 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; 
GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB6_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: 
; %entry -; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB6_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: 
s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 
0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, 
vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -1990,8 +1930,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc @@ -2003,9 +1943,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: sub_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2016,9 +1956,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2028,10 +1968,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: sub_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -2039,73 +1978,36 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11W64-LABEL: sub_i32_varying_offset: -; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, 
s[4:7], 0 offen glc -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W64-NEXT: s_endpgm +; GFX11-LABEL: sub_i32_varying_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX11W32-LABEL: sub_i32_varying_offset: -; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W32-NEXT: s_endpgm -; -; GFX12W64-LABEL: sub_i32_varying_offset: -; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W64-NEXT: s_endpgm -; -; GFX12W32-LABEL: sub_i32_varying_offset: -; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W32-NEXT: s_endpgm +; GFX12-LABEL: sub_i32_varying_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX11: {{.*}} -; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 720e2ef108076d..7e15c07f952697 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -23,10 +23,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 @@ -34,8 +34,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB0_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -52,10 +52,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 
s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 @@ -63,8 +63,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 @@ -81,10 +81,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 @@ -92,8 +92,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -109,10 +109,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: 
s_cbranch_execz .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 @@ -121,10 +121,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -134,26 +133,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[4:7], 0 idxen glc ; 
GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -164,7 +162,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -172,7 +170,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -181,8 +179,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB0_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -196,25 +194,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr 
addrspace ; ; GFX11W32-LABEL: add_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB0_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -229,7 +227,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: add_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -237,7 +235,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 
0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -246,8 +244,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -261,24 +259,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: add_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; 
GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -299,15 +297,15 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 @@ -315,8 +313,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -329,16 +327,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: 
s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 @@ -346,8 +344,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -360,16 +358,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 @@ -377,8 +375,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -390,16 +388,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) @@ -408,10 +406,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -421,38 +418,38 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; 
; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) @@ -460,7 +457,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -469,8 +466,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -484,42 +481,42 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB1_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -527,7 +524,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -536,8 +533,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: -; 
GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -551,32 +548,32 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: add_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] +; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] +; 
GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -589,8 +586,8 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc @@ -602,37 +599,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 
s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -644,37 +641,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -685,39 +682,38 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[0:1], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -727,38 +723,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: 
-; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -768,184 +763,178 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: 
s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB2_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: 
s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB2_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: 
v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -959,8 +948,8 @@ entry: define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vindex: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc @@ -972,9 +961,9 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: add_i32_varying_vindex: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 idxen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -985,9 +974,9 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_vindex: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -997,10 +986,9 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: add_i32_varying_vindex: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1008,67 +996,33 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; 
GFX11W64-LABEL: add_i32_varying_vindex: -; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W64-NEXT: s_endpgm -; -; GFX11W32-LABEL: add_i32_varying_vindex: -; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W32-NEXT: s_endpgm -; -; GFX12W64-LABEL: add_i32_varying_vindex: -; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W64-NEXT: s_endpgm -; -; GFX12W32-LABEL: add_i32_varying_vindex: -; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 
0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W32-NEXT: s_endpgm +; GFX11-LABEL: add_i32_varying_vindex: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: add_i32_varying_vindex: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0, i32 0) @@ -1080,10 +1034,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0xd -; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc @@ -1095,14 +1049,15 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: add_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s0, 0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1111,27 +1066,27 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: add_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1141,12 +1096,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX11W64-LABEL: add_i32_varying_offset: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_mov_b32 s0, 0 -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 1 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 @@ -1158,12 +1113,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX11W32-LABEL: add_i32_varying_offset: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_mov_b32 s0, 0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_mov_b32 s2, 0 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v1, 0x3ff, v0 +; GFX11W32-NEXT: v_dual_mov_b32 
v1, v0 :: v_dual_mov_b32 v0, s2 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 @@ -1175,12 +1130,11 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX12W64-LABEL: add_i32_varying_offset: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v1, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 1 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -1191,11 +1145,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX12W32-LABEL: add_i32_varying_offset: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 ; GFX12W32-NEXT: v_mov_b32_e32 v2, 1 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1218,10 +1171,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 @@ -1229,8 +1182,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB5_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1248,10 +1201,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 @@ -1259,8 +1212,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ 
-1278,10 +1231,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 @@ -1289,8 +1242,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1307,10 +1260,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1319,10 +1272,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, 
exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1333,26 +1285,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1364,7 +1315,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) 
%out, ptr addrspace ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1372,7 +1323,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1381,8 +1332,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB5_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1397,25 +1348,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: sub_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: -; 
GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB5_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1431,7 +1382,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: sub_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1439,7 +1390,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1448,8 +1399,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; 
GFX12W64-NEXT: .LBB5_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1464,24 +1415,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: sub_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1503,15 +1454,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) 
%out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB6_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 @@ -1519,8 +1470,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB6_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1533,16 +1484,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) 
; GFX8-NEXT: s_mul_i32 s4, s6, s4 @@ -1550,8 +1501,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB6_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1564,16 +1515,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 @@ -1581,8 +1532,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB6_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1594,16 +1545,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: sub_i32_uniform: ; 
GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) @@ -1612,8 +1563,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) @@ -1625,39 +1576,39 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 
; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1665,7 +1616,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -1674,8 +1625,8 @@ define 
amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB6_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) @@ -1690,43 +1641,43 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB6_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: 
v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1734,7 +1685,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1743,8 +1694,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -1759,33 +1710,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr 
addrspace( ; ; GFX12W32-LABEL: sub_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -1798,8 +1749,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: 
sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 idxen glc @@ -1811,37 +1762,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB7_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB7_4: -; GFX8-NEXT: s_or_b64 exec, exec, 
s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 @@ -1853,37 +1804,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB7_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1894,39 +1845,38 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[0:1], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1936,38 +1886,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: 
buffer_atomic_sub v0, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1977,186 +1926,180 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB7_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 
s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB7_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: 
s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB7_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: 
v_sub_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; 
implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -2170,8 +2113,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vindex: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc @@ -2183,9 +2126,9 @@ define amdgpu_kernel void 
@sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: sub_i32_varying_vindex: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 idxen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2196,9 +2139,9 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_vindex: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2208,10 +2151,9 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: sub_i32_varying_vindex: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -2219,67 +2161,33 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11W64-LABEL: sub_i32_varying_vindex: -; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W64-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W64-NEXT: s_endpgm -; -; GFX11W32-LABEL: sub_i32_varying_vindex: -; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W32-NEXT: s_endpgm -; -; GFX12W64-LABEL: sub_i32_varying_vindex: -; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W64-NEXT: s_endpgm -; -; GFX12W32-LABEL: sub_i32_varying_vindex: -; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: 
buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W32-NEXT: s_endpgm +; GFX11-LABEL: sub_i32_varying_vindex: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sub_i32_varying_vindex: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0, i32 0) @@ -2291,10 +2199,10 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc @@ -2306,14 +2214,15 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: sub_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s0, 0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2322,27 +2231,27 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sub_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s0, 0 +; 
GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -2352,12 +2261,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX11W64-LABEL: sub_i32_varying_offset: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_mov_b32 s0, 0 -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 1 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 @@ -2369,12 +2278,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX11W32-LABEL: sub_i32_varying_offset: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_mov_b32 s0, 0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_mov_b32 s2, 0 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v1, 0x3ff, v0 +; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: 
buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 @@ -2386,12 +2295,11 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX12W64-LABEL: sub_i32_varying_offset: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v1, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 1 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -2402,11 +2310,10 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX12W32-LABEL: sub_i32_varying_offset: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 ; GFX12W32-NEXT: v_mov_b32_e32 v2, 1 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -2420,6 +2327,3 @@ entry: store i32 %old, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX11: {{.*}} -; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index 417d38990505b6..2c69ae58f0e611 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -18,7 +18,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 @@ -33,7 +33,7 @@ entry: define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -42,7 +42,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; 
GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 @@ -58,8 +58,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6 @@ -72,8 +72,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5 @@ -92,7 +92,7 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN @@ -100,7 +100,7 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %a ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32: ; 
GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN @@ -114,7 +114,7 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 @@ -124,7 +124,7 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspac ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 @@ -140,11 +140,11 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-SDAG-NEXT: 
v_mov_b32_e32 v1, s6 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] @@ -155,8 +155,8 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr ; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN @@ -175,7 +175,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { ; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -185,7 +185,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i ; ; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -201,7 +201,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -211,7 +211,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, ; ; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -227,7 +227,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { ; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16 @@ -238,7 +238,7 @@ define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ; ; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 14519f5a5e77c0..7da058ca6ee7e7 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"} ; Exactly 10 waves per execution unit. 
; CHECK-LABEL: {{^}}exactly_10: -; CHECK: SGPRBlocks: 2 +; CHECK: SGPRBlocks: 1 ; CHECK: VGPRBlocks: 5 -; CHECK: NumSGPRsForWavesPerEU: 20 +; CHECK: NumSGPRsForWavesPerEU: 12 ; CHECK: NumVGPRsForWavesPerEU: 24 define amdgpu_kernel void @exactly_10() #9 { %val0 = load volatile float, ptr addrspace(1) @var diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll index 90562e25a3e9c1..b2f01660201d7e 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll @@ -1,6 +1,6 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O2 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=OPT %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=OPT %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV4 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV5 %s ; Check that AMDGPUAttributor is not run with -O0. 
; OPT: .amdhsa_user_sgpr_private_segment_buffer 1 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 16ffdd7ebe421f..a86a3f6f279d7b 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -3781,21 +3781,21 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v2, s30, 0 ; GCN-NEXT: v_writelane_b32 v2, s31, 1 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen @@ -3806,27 +3806,27 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call: ; GFX7: ; %bb.0: ; 
%entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s18, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[16:17] +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[16:17] -; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: v_writelane_b32 v2, s30, 0 ; GFX7-NEXT: v_writelane_b32 v2, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen @@ -3837,27 +3837,27 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s18 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[16:17] +; GFX8-NEXT: s_mov_b64 
exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: v_writelane_b32 v2, s30, 0 ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 @@ -3866,27 +3866,27 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, 
test_arg_store@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 @@ -3895,28 +3895,28 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s16 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 @@ -3926,7 +3926,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3968,21 +3968,21 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v2bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v4, s30, 0 ; GCN-NEXT: v_writelane_b32 v4, s31, 1 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, 2, v2 @@ -3998,27 +3998,27 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: 
s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v2bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s18, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[16:17] +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[16:17] -; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: v_writelane_b32 v4, s30, 0 ; GFX7-NEXT: v_writelane_b32 v4, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -4034,27 +4034,27 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s18 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v2bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: 
s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[16:17] +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: v_writelane_b32 v2, s30, 0 ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 @@ -4063,27 +4063,27 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v2bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: 
s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 @@ -4092,28 +4092,28 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s16 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, 
test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 @@ -4123,7 +4123,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4165,21 +4165,21 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v3bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v5, s30, 0 ; GCN-NEXT: v_writelane_b32 v5, s31, 1 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -4197,27 +4197,27 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v3bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s18, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[16:17] +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[16:17] -; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: v_writelane_b32 v4, s30, 0 ; GFX7-NEXT: v_writelane_b32 v4, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4235,27 +4235,27 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 
4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s18 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v3bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[16:17] +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: v_writelane_b32 v4, s30, 0 ; GFX8-NEXT: v_writelane_b32 v4, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 ; GFX8-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -4267,27 +4267,27 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v3bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen @@ -4298,28 +4298,28 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v3bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 
exec_lo, s16 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v3, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen @@ -4331,7 +4331,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4375,21 +4375,21 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v4bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v8, s30, 0 ; GCN-NEXT: v_writelane_b32 v8, s31, 1 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: 
s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -4415,27 +4415,27 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v4bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s18, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[16:17] +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[16:17] -; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: v_writelane_b32 v6, s30, 0 
; GFX7-NEXT: v_writelane_b32 v6, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -4461,27 +4461,27 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s18 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v4bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[16:17] +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: v_writelane_b32 v4, s30, 0 ; GFX8-NEXT: v_writelane_b32 v4, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 ; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -4493,27 +4493,27 @@ define void @test_call_v4bf16(<4 x 
bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v4bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen @@ -4524,28 +4524,28 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: test_call_v4bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s16 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v3, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen @@ -4557,7 +4557,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4599,21 +4599,21 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v8bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 
-; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v16, s30, 0 ; GCN-NEXT: v_writelane_b32 v16, s31, 1 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -4659,27 +4659,27 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v8bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s18, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[16:17] +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[16:17] -; GFX7-NEXT: s_add_u32 s16, 
s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: v_writelane_b32 v10, s30, 0 ; GFX7-NEXT: v_writelane_b32 v10, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -4725,27 +4725,27 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s18 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v8bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[16:17] +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; 
GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: v_writelane_b32 v6, s30, 0 ; GFX8-NEXT: v_writelane_b32 v6, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 12, v4 ; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -4763,27 +4763,27 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v8bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v5, s30, 0 ; GFX9-NEXT: v_writelane_b32 v5, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 ; GFX9-NEXT: s_waitcnt 
vmcnt(0) ; GFX9-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 @@ -4798,28 +4798,28 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v8bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s16 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v5, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v5, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 @@ -4835,7 +4835,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; 
GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4877,21 +4877,21 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v16bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v21, s30, 0 ; GCN-NEXT: v_writelane_b32 v21, s31, 1 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -4977,27 +4977,27 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v16bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s18, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[16:17] +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[16:17] -; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: v_writelane_b32 v18, s30, 0 ; GFX7-NEXT: v_writelane_b32 v18, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 @@ -5083,27 +5083,27 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s18 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v16bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[16:17] 
+; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: v_writelane_b32 v10, s30, 0 ; GFX8-NEXT: v_writelane_b32 v10, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 28, v8 ; GFX8-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -5133,27 +5133,27 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v16bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] 
+; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v9, s30, 0 ; GFX9-NEXT: v_writelane_b32 v9, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 @@ -5176,28 +5176,28 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v16bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s16 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v9, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 
v9, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 @@ -5221,7 +5221,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -27297,7 +27297,7 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: s_and_b32 s4, s6, 0x80000000 +; GCN-NEXT: s_and_b32 s4, s4, 0x80000000 ; GCN-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_or_b32_e32 v0, s4, v0 @@ -27308,7 +27308,7 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_and_b32 s4, s6, 0x80000000 +; GFX7-NEXT: s_and_b32 s4, s4, 0x80000000 ; GFX7-NEXT: s_lshr_b32 s4, s4, 16 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v0, s4, v0 @@ -27318,23 +27318,23 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GFX8-LABEL: v_copysign_bf16_s_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX8-NEXT: s_movk_i32 s5, 0x7fff +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_bf16_s_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_movk_i32 s5, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_bf16_s_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s6 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_bf16_s_bf16: @@ -27350,7 +27350,7 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GCN-LABEL: v_copysign_s_bf16_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 @@ -27361,7 +27361,7 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GFX7-LABEL: v_copysign_s_bf16_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 @@ -27372,23 +27372,23 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GFX8-LABEL: v_copysign_s_bf16_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX8-NEXT: s_movk_i32 s5, 0x7fff +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_bfi_b32 v0, s5, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_s_bf16_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: 
v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX9-NEXT: s_movk_i32 s5, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_s_bf16_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s6, v0 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_s_bf16_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll index 2c179de2a9c35c..0f20ed1320dad7 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll @@ -6,10 +6,10 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) { ; VI-LABEL: bfe_combine8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -24,11 +24,11 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) ; ; VI-SDWA-LABEL: bfe_combine8: ; VI-SDWA: ; %bb.0: -; VI-SDWA-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, 2 ; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -42,13 +42,13 @@ define 
amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) ; ; CI-LABEL: bfe_combine8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[2:3], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 6, v0 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: v_and_b32_e32 v0, 0x3fc, v0 @@ -71,11 +71,11 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x) { ; VI-LABEL: bfe_combine16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 16, 16 ; VI-NEXT: v_lshlrev_b32_e32 v0, 15, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -91,11 +91,11 @@ define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x ; ; VI-SDWA-LABEL: bfe_combine16: ; VI-SDWA: ; %bb.0: -; VI-SDWA-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, 15 ; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, 0 ; VI-SDWA-NEXT: v_lshlrev_b64 v[0:1], 2, 
v[0:1] @@ -111,13 +111,13 @@ define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x ; ; CI-LABEL: bfe_combine16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[2:3], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; CI-NEXT: v_and_b32_e32 v0, 0x7fff8000, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index f54ea615ca6645..af4116bd6aae5d 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_ubfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_ubfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -78,7 +78,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p ; ; VI-LABEL: v_ubfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -115,7 +115,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_ubfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -131,7 +131,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_ubfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -175,7 +175,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_sbfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -221,7 +221,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_sbfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -253,7 +253,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -276,7 +276,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p ; ; VI-LABEL: v_sbfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -313,7 +313,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_sbfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -329,7 +329,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_sbfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -353,7 +353,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -373,7 +373,7 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -401,8 +401,8 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_sbfe_or_shl_shl_uniform_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -417,8 +417,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ; ; VI-LABEL: 
s_sbfe_or_shl_shl_uniform_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -444,8 +444,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) { ; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -462,8 +462,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou ; ; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -491,8 +491,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) { ; SI-LABEL: s_sbfe_or_shl_shl_toosmall_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ 
-509,8 +509,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ; ; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll index 78d764898a3b93..7b8eaccaa4142b 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -11,50 +11,52 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_def_i32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_andn2_b32 s6, s6, s4 -; GFX7-NEXT: s_and_b32 s4, s5, s4 -; GFX7-NEXT: s_or_b32 s4, s6, s4 +; GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: s_mov_b32 s1, s5 +; GFX7-NEXT: s_andn2_b32 s4, s8, s6 +; GFX7-NEXT: s_and_b32 s5, s7, s6 +; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_def_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_andn2_b32 s2, s6, s4 -; GFX8-NEXT: s_and_b32 s3, s5, s4 -; GFX8-NEXT: s_or_b32 s2, s2, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: 
v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_and_b32 s1, s7, s6 +; GFX8-NEXT: s_andn2_b32 s0, s0, s6 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_def_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_andn2_b32 s2, s6, s4 -; GFX10-NEXT: s_and_b32 s3, s5, s4 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_and_b32 s1, s7, s6 +; GFX10-NEXT: s_andn2_b32 s0, s0, s6 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_def_i32: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_and_b32 s1, s7, s6 ; GFX8-GISEL-NEXT: s_andn2_b32 s0, s0, s6 @@ -68,8 +70,8 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, ; GFX10-GISEL-LABEL: s_bfi_def_i32: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_and_b32 s1, s7, s6 
@@ -130,50 +132,52 @@ entry: define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_sha256_ch: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b32 s5, s5, s6 -; GFX7-NEXT: s_and_b32 s4, s4, s5 -; GFX7-NEXT: s_xor_b32 s4, s6, s4 +; GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: s_xor_b32 s4, s7, s8 +; GFX7-NEXT: s_and_b32 s4, s6, s4 +; GFX7-NEXT: s_xor_b32 s4, s8, s4 +; GFX7-NEXT: s_mov_b32 s1, s5 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_sha256_ch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s2, s5, s6 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_xor_b32 s2, s6, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b32 s1, s7, s0 +; GFX8-NEXT: s_and_b32 s1, s6, s1 +; GFX8-NEXT: s_xor_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_sha256_ch: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b32 s2, 
s5, s6 -; GFX10-NEXT: s_and_b32 s2, s4, s2 -; GFX10-NEXT: s_xor_b32 s2, s6, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_xor_b32 s1, s7, s0 +; GFX10-NEXT: s_and_b32 s1, s6, s1 +; GFX10-NEXT: s_xor_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ch: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-GISEL-NEXT: s_xor_b32 s1, s7, s0 @@ -187,8 +191,8 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y ; GFX10-GISEL-LABEL: s_bfi_sha256_ch: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_xor_b32 s1, s7, s0 @@ -454,53 +458,55 @@ entry: define amdgpu_kernel void @s_bfi_sha256_ma(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_sha256_ma: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s6 -; GFX7-NEXT: s_or_b32 s4, s4, s6 -; GFX7-NEXT: s_and_b32 s4, s5, s4 -; GFX7-NEXT: s_or_b32 s4, s7, s4 +; GFX7-NEXT: s_mov_b32 s1, s5 +; GFX7-NEXT: s_or_b32 s5, s6, s8 +; 
GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: s_and_b32 s4, s6, s8 +; GFX7-NEXT: s_and_b32 s5, s7, s5 +; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_sha256_ma: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b32 s3, s4, s6 -; GFX8-NEXT: s_and_b32 s2, s4, s6 -; GFX8-NEXT: s_and_b32 s3, s5, s3 -; GFX8-NEXT: s_or_b32 s2, s2, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_and_b32 s1, s6, s0 +; GFX8-NEXT: s_or_b32 s0, s6, s0 +; GFX8-NEXT: s_and_b32 s0, s7, s0 +; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_sha256_ma: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_or_b32 s2, s4, s6 -; GFX10-NEXT: s_and_b32 s3, s4, s6 -; GFX10-NEXT: s_and_b32 s2, s5, s2 -; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_or_b32 s1, s6, s0 +; GFX10-NEXT: s_and_b32 s0, s6, s0 +; GFX10-NEXT: s_and_b32 s1, s7, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ma: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-GISEL-NEXT: s_and_b32 s1, s6, s0 @@ -515,8 +521,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma(ptr addrspace(1) %out, i32 %x, i32 %y ; GFX10-GISEL-LABEL: s_bfi_sha256_ma: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_or_b32 s1, s6, s0 @@ -1402,8 +1408,8 @@ entry: define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX7-LABEL: s_bitselect_i64_pat_0: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1419,8 +1425,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-LABEL: s_bitselect_i64_pat_0: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -1435,8 +1441,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX10-LABEL: s_bitselect_i64_pat_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -1450,8 +1456,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_0: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -1466,8 +1472,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_0: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -1490,8 +1496,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX7-LABEL: s_bitselect_i64_pat_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1507,8 +1513,8 @@ define 
amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-LABEL: s_bitselect_i64_pat_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1523,8 +1529,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX10-LABEL: s_bitselect_i64_pat_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1538,8 +1544,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_1: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1554,8 +1560,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_1: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: 
s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1578,8 +1584,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX7-LABEL: s_bitselect_i64_pat_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1595,8 +1601,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-LABEL: s_bitselect_i64_pat_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1611,8 +1617,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX10-LABEL: s_bitselect_i64_pat_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1626,8 +1632,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_2: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1642,8 +1648,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_2: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1666,8 +1672,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; GFX7-LABEL: s_bfi_sha256_ma_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1684,8 +1690,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; ; GFX8-LABEL: s_bfi_sha256_ma_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1] ; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] @@ -1701,8 +1707,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; GFX10-LABEL: s_bfi_sha256_ma_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1] ; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] @@ -1717,8 +1723,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ma_i64: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1] ; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] @@ -1734,8 +1740,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; GFX10-GISEL-LABEL: s_bfi_sha256_ma_i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1] ; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll index 4ad3667f689583..0f40576a7459cc 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll @@ -283,7 +283,7 @@ define float @v_bfi_single_constant_as_partition(float %x, float %y, float %z) { define amdgpu_kernel void @v_bfi_dont_applied_for_scalar_ops(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: v_bfi_dont_applied_for_scalar_ops: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 
0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll index 2e64db12ef564c..f8bd44b7c98f59 100644 --- a/llvm/test/CodeGen/AMDGPU/bfm.ll +++ b/llvm/test/CodeGen/AMDGPU/bfm.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) #0 { ; SI-LABEL: s_bfm_pattern: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfm_b32 s2, s2, s3 @@ -18,7 +18,7 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) ; ; VI-LABEL: s_bfm_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfm_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -36,11 +36,11 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) #0 { ; SI-LABEL: s_bfm_pattern_simple: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfm_b32 s4, s4, 0 +; SI-NEXT: s_bfm_b32 s4, s2, 0 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -48,10 +48,10 @@ define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) # ; ; VI-LABEL: s_bfm_pattern_simple: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfm_b32 s2, s4, 0 +; 
VI-NEXT: s_bfm_b32 s2, s2, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index 6f52da2631b8a6..64555f14a55cc1 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -21,8 +21,8 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #0 { ; SI-LABEL: s_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -34,8 +34,8 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; ; FLAT-LABEL: s_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dword s4, s[2:3], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -47,10 +47,10 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; ; GISEL-LABEL: s_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s2, s4, 0xffff +; GISEL-NEXT: s_and_b32 s2, s2, 0xffff ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_lshr_b32 s2, s2, 16 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -62,10 +62,10 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-FLAT-LABEL: s_brev_i16: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; 
GFX11-FLAT-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b32 s2, s4 +; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] @@ -76,11 +76,11 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-GISEL-LABEL: s_brev_i16: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 ; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 16 @@ -98,7 +98,7 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -117,7 +117,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_mov_b32 s10, 
s6 @@ -136,7 +136,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i16: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: v_mov_b32_e32 v1, 0 @@ -168,7 +168,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3] @@ -187,8 +187,8 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #0 { ; SI-LABEL: s_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -199,8 +199,8 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; ; FLAT-LABEL: s_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dword s4, s[2:3], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 
0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -211,10 +211,10 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; ; GISEL-LABEL: s_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_brev_b32 s2, s4 +; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -224,11 +224,11 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-FLAT-LABEL: s_brev_i32: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b32 s2, s4 +; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 @@ -240,11 +240,11 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-GISEL-LABEL: s_brev_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_brev_b32 s2, s4 +; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, 
s[0:1] @@ -259,7 +259,7 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -278,7 +278,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -294,7 +294,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -311,9 +311,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b32 v0, v0, s[2:3] @@ -328,10 +326,8 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -351,7 +347,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> %val) #0 { ; SI-LABEL: s_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -366,7 +362,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; FLAT-LABEL: s_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -381,7 +377,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GISEL-LABEL: s_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_brev_b32 s3, s3 @@ -394,7 +390,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GFX11-FLAT-LABEL: s_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -411,7 +407,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GFX11-GISEL-LABEL: s_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 @@ -430,7 +426,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -450,7 +446,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -467,7 +463,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GISEL-LABEL: v_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -485,9 +481,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-FLAT-LABEL: v_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -503,11 +497,9 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-GISEL-LABEL: 
v_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -528,7 +520,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #0 { ; SI-LABEL: s_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -542,7 +534,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; FLAT-LABEL: s_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -556,7 +548,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GISEL-LABEL: s_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -568,7 +560,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GFX11-FLAT-LABEL: s_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[2:3] ; GFX11-FLAT-NEXT: 
s_mov_b32 s3, 0x31016000 @@ -581,7 +573,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GFX11-GISEL-LABEL: s_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] @@ -599,7 +591,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -619,7 +611,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -636,7 +628,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -654,9 +646,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FLAT-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -672,9 +662,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -697,8 +685,8 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) #0 { ; SI-LABEL: s_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -713,8 +701,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; ; FLAT-LABEL: s_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -729,8 +717,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; ; GISEL-LABEL: s_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[0:1], s[4:5] ; GISEL-NEXT: 
s_brev_b64 s[2:3], s[6:7] @@ -746,8 +734,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; GFX11-FLAT-LABEL: s_brev_v2i64: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[4:5] ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[6:7] @@ -763,8 +751,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; GFX11-GISEL-LABEL: s_brev_v2i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b64 s[0:1], s[4:5] @@ -783,7 +771,7 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -805,7 +793,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -824,7 +812,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GISEL-LABEL: 
v_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -844,9 +832,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-FLAT-LABEL: v_brev_v2i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b128 v[0:3], v0, s[2:3] @@ -864,9 +850,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-GISEL-LABEL: v_brev_v2i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b128 v[0:3], v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll index 857b13fab8a7ce..3dbbb877918ad2 100644 --- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -6,18 +6,18 @@ define amdgpu_kernel void @br_cc_f16( ; SI-LABEL: br_cc_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_waitcnt 
lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -29,28 +29,28 @@ define amdgpu_kernel void @br_cc_f16( ; SI-NEXT: .LBB0_2: ; %two ; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NEXT: .LBB0_3: ; %one -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: br_cc_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_cbranch_vccnz .LBB0_2 ; VI-NEXT: ; %bb.1: 
; %one @@ -63,8 +63,8 @@ define amdgpu_kernel void @br_cc_f16( ; GFX11-LABEL: br_cc_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[8:9], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s2 @@ -111,7 +111,7 @@ two: define amdgpu_kernel void @br_cc_f16_imm_a( ; SI-LABEL: br_cc_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -137,7 +137,7 @@ define amdgpu_kernel void @br_cc_f16_imm_a( ; ; VI-LABEL: br_cc_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -157,7 +157,7 @@ define amdgpu_kernel void @br_cc_f16_imm_a( ; ; GFX11-LABEL: br_cc_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -195,7 +195,7 @@ two: define amdgpu_kernel void @br_cc_f16_imm_b( ; SI-LABEL: br_cc_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -221,7 +221,7 @@ define amdgpu_kernel void @br_cc_f16_imm_b( ; ; VI-LABEL: br_cc_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ 
-243,7 +243,7 @@ define amdgpu_kernel void @br_cc_f16_imm_b( ; ; GFX11-LABEL: br_cc_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index adfc177c8bf749..6201d7341898f5 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -4,10 +4,10 @@ define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { ; CHECK-LABEL: spill: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s44, s[6:7], 0x2 +; CHECK-NEXT: s_load_dword s44, s[4:5], 0x2 ; CHECK-NEXT: s_mov_b64 s[98:99], s[2:3] ; CHECK-NEXT: s_mov_b64 s[96:97], s[0:1] -; CHECK-NEXT: s_add_u32 s96, s96, s13 +; CHECK-NEXT: s_add_u32 s96, s96, s7 ; CHECK-NEXT: s_addc_u32 s97, s97, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_eq_u32 s44, 0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index 635f3e4886b875..2f637df4e93022 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -22,9 +22,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_max_short_forward_branch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART @@ -34,10 +34,10 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_sleep 0 ; 
GCN-NEXT: .LBB0_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -63,9 +63,9 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB1_1 ; GCN-NEXT: ; %bb.3: ; %bb0 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -81,10 +81,10 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -110,9 +110,9 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0 +; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s2, 0 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] ; GCN-NEXT: s_cbranch_vccz .LBB2_1 ; GCN-NEXT: ; %bb.3: ; %bb0 @@ -130,10 +130,10 @@ define amdgpu_kernel void 
@uniform_conditional_min_long_forward_vcnd_branch(ptr ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -158,7 +158,7 @@ bb3: define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: min_long_forward_vbranch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -261,28 +261,28 @@ bb3: define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { ; GCN-LABEL: uniform_unconditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, 0 -; GCN-NEXT: s_mov_b64 s[0:1], -1 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc0 .LBB5_1 ; GCN-NEXT: ; %bb.7: ; %bb0 -; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: s_getpc_b64 s[2:3] ; GCN-NEXT: .Lpost_getpc5: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB5_4-.Lpost_getpc5)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB5_4-.Lpost_getpc5)>>32 -; GCN-NEXT: s_setpc_b64 s[0:1] +; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_4-.Lpost_getpc5)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_4-.Lpost_getpc5)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] ; GCN-NEXT: .LBB5_1: ; %Flow -; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN-NEXT: s_cbranch_vccnz .LBB5_3 ; GCN-NEXT: .LBB5_2: ; %bb2 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: 
s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 17 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB5_3: ; %bb4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt expcnt(0) @@ -300,17 +300,17 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_cbranch_execnz .LBB5_5 ; GCN-NEXT: ; %bb.9: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: s_getpc_b64 s[2:3] ; GCN-NEXT: .Lpost_getpc6: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB5_2-.Lpost_getpc6)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB5_2-.Lpost_getpc6)>>32 -; GCN-NEXT: s_setpc_b64 s[0:1] +; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_2-.Lpost_getpc6)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_2-.Lpost_getpc6)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] ; GCN-NEXT: .LBB5_5: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: s_getpc_b64 s[2:3] ; GCN-NEXT: .Lpost_getpc4: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB5_3-.Lpost_getpc4)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB5_3-.Lpost_getpc4)>>32 -; GCN-NEXT: s_setpc_b64 s[0:1] +; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_3-.Lpost_getpc4)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_3-.Lpost_getpc4)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 @@ -375,7 +375,7 @@ loop: define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { ; GCN-LABEL: expand_requires_expand: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lt_i32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -453,8 +453,8 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) 
%out, i32 % ; GCN-NEXT: s_addc_u32 s1, s1, (.LBB8_3-.Lpost_getpc9)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB8_1: ; %if -; GCN-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -572,10 +572,10 @@ ret: define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 { ; GCN-LABEL: long_branch_hang: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s4, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_cmp_lt_i32 s7, 6 @@ -607,25 +607,25 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; GCN-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-NEXT: ; %bb.10: ; %Flow5 -; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: s_getpc_b64 s[2:3] ; GCN-NEXT: .Lpost_getpc13: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB10_6-.Lpost_getpc13)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB10_6-.Lpost_getpc13)>>32 -; GCN-NEXT: s_setpc_b64 s[0:1] +; GCN-NEXT: s_add_u32 s2, s2, (.LBB10_6-.Lpost_getpc13)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB10_6-.Lpost_getpc13)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] ; GCN-NEXT: .LBB10_5: ; %bb14 ; GCN-NEXT: s_cmp_lt_i32 s5, 9 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_cmp_lt_i32 s6, s7 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; 
GCN-NEXT: s_branch .LBB10_7 ; GCN-NEXT: .LBB10_6: ; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB10_7: ; %bb19 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xf -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xf +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index 321a7ceb826f6e..e4c7df385d8619 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -19,7 +19,7 @@ declare i48 @llvm.bswap.i48(i48) #1 define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -34,7 +34,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_bswap_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -49,7 +49,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: test_bswap_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -69,7 +69,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -87,7 +87,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -103,7 +103,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -124,7 +124,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -148,7 +148,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -166,7 +166,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-NEXT: 
s_mov_b32 s3, 0x31016000 @@ -189,7 +189,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v8i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -226,7 +226,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v4, 0x10203 ; VI-NEXT: s_mov_b32 s15, 0xf000 ; VI-NEXT: s_mov_b32 s14, -1 @@ -249,7 +249,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v8i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 @@ -278,7 +278,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -296,7 +296,7 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_bswap_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -312,7 +312,7 
@@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: test_bswap_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -333,7 +333,7 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -357,7 +357,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -375,7 +375,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v2i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -398,7 +398,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -435,7 +435,7 @@ define 
amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v4, 0x10203 ; VI-NEXT: s_mov_b32 s15, 0xf000 ; VI-NEXT: s_mov_b32 s14, -1 @@ -458,7 +458,7 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v4i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 0cdd6b919f1c8b..d50ba64ba5d47f 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -13,35 +13,35 @@ ; float ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 
0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -49,19 +49,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 
0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -70,44 +66,36 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -115,30 +103,26 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; 
GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -146,30 +130,26 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, 
s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -177,30 +157,26 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB0_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 
v3, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -209,51 +185,51 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB0_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %result } -define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: +define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; 
%bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -261,18 +237,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; 
GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -280,117 +252,97 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; 
GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 ; 
GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; 
GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -398,25 +350,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v2, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; 
GFX6-NEXT: s_cbranch_execnz .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } -define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: +define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -449,7 +401,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[2:3], exec @@ -477,7 +429,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: +; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, exec_lo @@ -507,7 +459,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 @@ -569,7 +521,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad ; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[6:7], exec @@ -595,7 +547,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 @@ -653,7 +605,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad ; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: +; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 @@ -711,7 +663,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad ; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 @@ -768,7 +720,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 @@ -826,4033 +778,2324 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %result } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { 
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: +define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB3_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s4, 
vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX10-NEXT: s_add_i32 s4, s18, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; 
GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 
offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 
-; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 
offen offset:2048 -; GFX7-NEXT: s_add_i32 s6, s18, 0x800 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; 
GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s6, s18, 0x800 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) 
%gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: +define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1] -; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 -; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: 
v_dual_mov_b32 v5, v8 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x800 -; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v10, v5 -; GFX10-NEXT: v_mov_b32_e32 v9, v4 +; GFX10-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v4, v7 -; GFX10-NEXT: v_mov_b32_e32 v5, v8 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v10, v5 -; GFX908-NEXT: v_mov_b32_e32 v9, v4 -; 
GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v7 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, v5 -; GFX8-NEXT: v_mov_b32_e32 v9, v4 -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; 
GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v7 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; GFX7-NEXT: s_add_i32 s6, s18, 0x800 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v10, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: 
buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, v8 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s6, s18, 0x800 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX6-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v5 -; GFX6-NEXT: v_mov_b32_e32 v9, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 
v4, v1 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v5, v8 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: +define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; 
GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[11:12], v[13:14], v[5:6] -; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB5_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v6 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; 
GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 
s[4:5], v[9:10] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] -; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX11-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB5_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v9 -; GFX10-NEXT: v_readfirstlane_b32 s9, v10 -; GFX10-NEXT: 
v_readfirstlane_b32 s10, v7 -; GFX10-NEXT: v_readfirstlane_b32 s11, v8 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] -; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-NEXT: v_mov_b32_e32 v1, v12 -; GFX10-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-NEXT: v_mov_b32_e32 v3, v14 -; GFX10-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_readfirstlane_b32 s8, v9 -; GFX10-NEXT: v_readfirstlane_b32 s9, v10 -; GFX10-NEXT: v_readfirstlane_b32 s10, v7 -; GFX10-NEXT: v_readfirstlane_b32 s11, v8 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; 
GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX10-NEXT: v_mov_b32_e32 v14, v1 -; GFX10-NEXT: v_mov_b32_e32 v13, v0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB5_3 -; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB5_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: 
buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: 
v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v0, v11 -; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 -; GFX908-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_4 -; GFX908-NEXT: ; %bb.5: ; 
in Loop: Header=BB5_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX908-NEXT: v_mov_b32_e32 v14, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v13, v0 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB5_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: ; 
implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v0, v11 -; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 -; GFX8-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX8-NEXT: 
v_mov_b32_e32 v14, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB5_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4 -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v9 -; GFX7-NEXT: v_readfirstlane_b32 s9, v10 -; GFX7-NEXT: v_readfirstlane_b32 s10, v7 -; GFX7-NEXT: v_readfirstlane_b32 s11, v8 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] -; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: 
v_mov_b32_e32 v0, v11 -; GFX7-NEXT: v_mov_b32_e32 v1, v12 -; GFX7-NEXT: v_mov_b32_e32 v2, v13 -; GFX7-NEXT: v_mov_b32_e32 v3, v14 -; GFX7-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX7-NEXT: v_readfirstlane_b32 s8, v9 -; GFX7-NEXT: v_readfirstlane_b32 s9, v10 -; GFX7-NEXT: v_readfirstlane_b32 s10, v7 -; GFX7-NEXT: v_readfirstlane_b32 s11, v8 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX7-NEXT: v_mov_b32_e32 v14, v1 -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v13, v0 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB5_3 -; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; 
GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4 -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v9 -; GFX6-NEXT: v_readfirstlane_b32 s9, v10 -; GFX6-NEXT: v_readfirstlane_b32 s10, v7 -; GFX6-NEXT: v_readfirstlane_b32 s11, v8 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] -; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v11 
-; GFX6-NEXT: v_mov_b32_e32 v1, v12 -; GFX6-NEXT: v_mov_b32_e32 v2, v13 -; GFX6-NEXT: v_mov_b32_e32 v3, v14 -; GFX6-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX6-NEXT: v_readfirstlane_b32 s8, v9 -; GFX6-NEXT: v_readfirstlane_b32 s9, v10 -; GFX6-NEXT: v_readfirstlane_b32 s10, v7 -; GFX6-NEXT: v_readfirstlane_b32 s11, v8 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB5_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX6-NEXT: v_mov_b32_e32 v14, v1 -; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v13, v0 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB5_3 -; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB5_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) 
%gep, float %val syncscope("agent") seq_cst + ret float %result } -; -------------------------------------------------------------------- -; half -; -------------------------------------------------------------------- - -define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: +define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: v_mov_b32_e32 v5, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v4, v2 
:: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: buffer_load_dword v2, v5, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[8:11], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: 
v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[8:11], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: 
buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v5, s4 -; GFX908-NEXT: buffer_load_dword v2, v5, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 -; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: 
v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3 +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: buffer_load_dword v2, v5, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 -; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s7, v2 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v1, s6, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; 
GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: 
s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr 
addrspace(7) %gep, half %val syncscope("agent") seq_cst - ret half %result + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret float %result } -define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: +define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; 
GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: 
buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], 
v3, s[8:11], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[8:11], 0 
offen glc +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, s4 -; GFX908-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 -; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; 
GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 -; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: 
v_and_b32_e32 v4, s7, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; 
GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; 
GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(7) 
%ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %result } -define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v11, v7 -; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: 
s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 -; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB8_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0 -; GFX940-NEXT: 
v_not_b32_e32 v11, v6 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB8_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX940-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: 
Header=BB8_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v8 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB8_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX11-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v11, v7 -; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen -; GFX11-NEXT: 
s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 -; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 
s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB8_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v11, v7 -; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, 
v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: s_addk_i32 s8, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v9, v7 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 -; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 -; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; 
GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB8_3 -; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v11, v6 -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: 
.LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX90A-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 -; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 
v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v8 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_3 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v11, v6 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB8_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; 
GFX908-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX908-NEXT: v_mov_b32_e32 v9, v7 -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v8, v6 -; GFX908-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, 
s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB8_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v11, v6 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB8_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX8-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v6, v4, v6 -; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, v7 -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v8, v6 -; GFX8-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB8_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 
exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX7-NEXT: v_not_b32_e32 v9, v4 -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s10, s8, 0x800 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 
+; GFX7-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 -; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB8_3 -; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: 
v_lshrrev_b32_e32 v0, v7, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_not_b32_e32 v9, v4 -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB8_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 -; 
GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 -; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s10, s8, 0x800 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s10 +; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB8_3 -; GFX6-NEXT: 
; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst - ret half %result + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -; -------------------------------------------------------------------- -; bfloat -; -------------------------------------------------------------------- - -define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: 
s_addk_co_i32 s4, 0x800 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_execnz .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v10, v5 +; GFX10-NEXT: v_mov_b32_e32 v9, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; 
GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v4, v7 +; GFX10-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; 
GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 
+; GFX908-NEXT: v_mov_b32_e32 v6, s10 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v10, v5 +; GFX908-NEXT: v_mov_b32_e32 v9, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v7 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: v_mov_b32_e32 v5, v8 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; 
%bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v10, v5 +; GFX8-NEXT: v_mov_b32_e32 v9, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v7 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v5, v8 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s10, s8, 0x800 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 
v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v10, v5 +; GFX7-NEXT: v_mov_b32_e32 v9, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; 
GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s10, s8, 0x800 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s10 ; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v10, v5 +; GFX6-NEXT: v_mov_b32_e32 v9, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst - ret bfloat %result + %gep = getelementptr double, ptr addrspace(7) %ptr, 
i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: +define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB10_3: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-NEXT: ; Child Loop BB10_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_add_f64_e32 v[11:12], v[13:14], v[5:6] +; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX12-NEXT: 
v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX12-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 +; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB10_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 +; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB10_3 +; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 -; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX940-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 
vcc, v4, v1 -; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX940-NEXT: ; implicit-def: $vgpr4 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB10_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB10_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB10_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], 
v2, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX11-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB10_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB10_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: +; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v9 +; GFX10-NEXT: v_readfirstlane_b32 s9, v10 +; GFX10-NEXT: v_readfirstlane_b32 s10, v7 +; GFX10-NEXT: v_readfirstlane_b32 s11, v8 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: .LBB10_3: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Loop Header: Depth=1 +; GFX10-NEXT: ; Child Loop BB10_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; GFX10-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v0, v11 +; GFX10-NEXT: v_mov_b32_e32 v1, v12 +; GFX10-NEXT: v_mov_b32_e32 v2, v13 +; GFX10-NEXT: v_mov_b32_e32 v3, v14 +; GFX10-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 +; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX10-NEXT: v_readfirstlane_b32 s8, v9 +; GFX10-NEXT: v_readfirstlane_b32 s9, v10 +; GFX10-NEXT: v_readfirstlane_b32 s10, v7 +; GFX10-NEXT: v_readfirstlane_b32 s11, v8 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB10_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX10-NEXT: v_mov_b32_e32 v14, v1 +; GFX10-NEXT: v_mov_b32_e32 v13, v0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_cbranch_execnz .LBB10_3 +; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 
-; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, 
s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB10_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB10_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 
-; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v0, v11 +; GFX908-NEXT: v_mov_b32_e32 v1, v12 +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, v14 +; GFX908-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB10_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX908-NEXT: v_mov_b32_e32 v14, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v13, v0 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB10_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: +; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB10_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB10_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v1, v12 +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB10_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: 
v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB10_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4 +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v9 +; GFX7-NEXT: v_readfirstlane_b32 s9, v10 +; GFX7-NEXT: v_readfirstlane_b32 s10, v7 +; GFX7-NEXT: v_readfirstlane_b32 s11, v8 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 
offen offset:2048 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB10_3: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Loop Header: Depth=1 +; GFX7-NEXT: ; Child Loop BB10_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v0, v11 +; GFX7-NEXT: v_mov_b32_e32 v1, v12 +; GFX7-NEXT: v_mov_b32_e32 v2, v13 +; GFX7-NEXT: v_mov_b32_e32 v3, v14 +; GFX7-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 +; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX7-NEXT: v_readfirstlane_b32 s8, v9 +; GFX7-NEXT: v_readfirstlane_b32 s9, v10 +; GFX7-NEXT: v_readfirstlane_b32 s10, v7 +; GFX7-NEXT: v_readfirstlane_b32 s11, v8 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB10_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 +; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX7-NEXT: v_mov_b32_e32 v14, v1 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: 
v_mov_b32_e32 v13, v0 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB10_3 +; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4 +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v9 +; GFX6-NEXT: v_readfirstlane_b32 s9, v10 +; GFX6-NEXT: v_readfirstlane_b32 s10, v7 +; GFX6-NEXT: v_readfirstlane_b32 s11, v8 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; 
GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: ; implicit-def: $vgpr4 +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_mov_b64 s[6:7], 0 +; GFX6-NEXT: .LBB10_3: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Loop Header: Depth=1 +; GFX6-NEXT: ; Child Loop BB10_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v11 +; GFX6-NEXT: v_mov_b32_e32 v1, v12 +; GFX6-NEXT: v_mov_b32_e32 v2, v13 +; GFX6-NEXT: v_mov_b32_e32 v3, v14 +; GFX6-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 +; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX6-NEXT: v_readfirstlane_b32 s8, v9 +; GFX6-NEXT: v_readfirstlane_b32 s9, v10 +; GFX6-NEXT: v_readfirstlane_b32 s10, v7 +; GFX6-NEXT: v_readfirstlane_b32 s11, v8 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB10_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1 +; GFX6-NEXT: s_mov_b64 exec, s[12:13] 
; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX6-NEXT: v_mov_b32_e32 v14, v1 +; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v13, v0 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_cbranch_execnz .LBB10_3 +; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst - ret void + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: +define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-NEXT: 
v_lshlrev_b32_e32 v7, 3, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: v_add_f32_e32 v4, 
v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; 
GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB11_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 -; GFX940-NEXT: v_not_b32_e32 v10, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; 
GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB11_4 Depth 2 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_add_f32_e32 v4, v4, v11 -; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, 
s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB11_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: 
.LBB11_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB11_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; 
GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: s_addk_i32 s8, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: 
v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 -; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB11_3 -; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: ; %bb.2: 
; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 -; GFX90A-NEXT: v_not_b32_e32 v10, v4 -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v11 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: 
v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 -; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 -; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_3 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX908-NEXT: 
v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX908-NEXT: v_not_b32_e32 v9, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB11_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; 
GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB11_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX8-NEXT: v_not_b32_e32 v9, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB11_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB11_3 -; GFX8-NEXT: ; 
%bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX7-NEXT: v_not_b32_e32 v9, v4 -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 -; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB11_4 Depth 2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 
16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 -; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s10, s8, 0x800 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB11_3 -; GFX7-NEXT: ; %bb.6: ; 
%atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_not_b32_e32 v9, v4 -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s10, s8, 0x800 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s10 +; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_add_f64 
v[7:8], v[9:10], v[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB11_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 -; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 -; GFX6-NEXT: 
s_mov_b64 exec, s[12:13] -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB11_3 -; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst - ret bfloat %result + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret double %result } -; -------------------------------------------------------------------- -; <2 x half> -; -------------------------------------------------------------------- - -define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; 
GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -4861,2455 +3104,8254 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 
s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: s_addk_i32 s8, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; 
GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 
0 offen offset:2048 +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, 
v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s10, s8, 0x800 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s10, s8, 0x800 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s10 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; 
GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst - ret <2 x half> %result + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret double %result } -define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; 
-------------------------------------------------------------------- +; half +; -------------------------------------------------------------------- + +define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v5, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s4, 3 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: ; 
%bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v5, s5 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: 
buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v5, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s8, v2 ; GFX10-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s10, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s9 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; 
GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s10, v3 +; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s10, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s11, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v5, s9 +; GFX908-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s10, v2 +; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 
s10, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s11, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 
offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s10, v2 +; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v3, s11, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s10, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s9 +; GFX7-NEXT: 
buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst - ret void + %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret half %result } -define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) 
#0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: +define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v3, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: 
s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s4, 3 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v5 ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: 
.LBB14_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v3, s5 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v7, v8, v5 -; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 -; GFX11-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB14_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: .LBB14_3: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v3, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v7, v8, v5 -; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s8, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v6, v7 -; GFX10-NEXT: 
v_mov_b32_e32 v7, v8 -; GFX10-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 -; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s10, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB14_3 -; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s9 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s10, v3 +; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s10, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s11, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; 
GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB14_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB14_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v7, v8, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v7, v8 -; GFX908-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v3, s9 +; GFX908-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s10, v2 +; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s10, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s11, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB14_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB14_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB14_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v6, v8, v5 -; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v7, v8 -; GFX8-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 
s[4:5], s[4:5] +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s10, v2 +; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v4, s11, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s10, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB14_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 -; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB14_3: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB14_4 Depth 2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v10 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 -; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: 
v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB14_3 -; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; 
GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5 +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: 
v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: .LBB14_3: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB14_4 Depth 2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v10 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v11 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 -; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] +; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_or_b32_e32 
v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB14_3 -; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst - ret <2 x half> %result + %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -; -------------------------------------------------------------------- -; <2 x bfloat> -; -------------------------------------------------------------------- - -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; 
GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-NEXT: v_not_b32_e32 v11, v7 +; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: 
v_add_f16_e32 v6, v6, v5 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_3 +; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX940-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v11, v6 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; 
GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Loop Header: Depth=1 +; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX940-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX940-NEXT: s_mov_b64 s[8:9], exec +; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB15_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v7, v8 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 
vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB15_3 +; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-NEXT: v_not_b32_e32 v11, v7 +; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz 
.LBB15_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v11, v7 +; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start +; 
GFX10-NEXT: ; =>This Loop Header: Depth=1 +; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX10-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, 
exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB15_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_cbranch_execnz .LBB15_3 +; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v11, v6 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: 
v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Loop Header: Depth=1 +; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX90A-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] 
op_sel:[0,1] +; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX90A-NEXT: s_mov_b64 exec, s[12:13] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v7, v8 +; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 +; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: 
v_add_u32_e32 v4, 0x200, v4 +; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v11, v6 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 -; GFX908-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 -; GFX908-NEXT: v_mov_b32_e32 v0, v5 -; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX908-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX908-NEXT: v_mov_b32_e32 v9, v7 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB15_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_cbranch_execnz .LBB15_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_lshrrev_b32_e32 
v0, v4, v8 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 +; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v11, v6 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: 
v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX8-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, v7 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB15_4 +; 
GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_cbranch_execnz .LBB15_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 
-; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 +; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_not_b32_e32 v9, v4 +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 
0xffff0000, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Loop Header: Depth=1 +; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], 
vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX7-NEXT: s_mov_b64 exec, s[12:13] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB15_3 +; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 +; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_not_b32_e32 v9, v4 +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX6-NEXT: s_mov_b64 s[6:7], 0 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v10, v4 +; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Loop Header: Depth=1 +; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB15_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_cbranch_execnz .LBB15_3 +; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end +; 
GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst - ret <2 x bfloat> %result + %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret half %result } -define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; -------------------------------------------------------------------- +; bfloat +; -------------------------------------------------------------------- + +define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s5, 0 +; 
GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 -; GFX940-NEXT: 
buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 -; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s5 +; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s4, 3 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, 
v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v4, s5 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen 
; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v4, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX10-NEXT: 
s_mov_b32 s9, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX10-NEXT: 
v_mov_b32_e32 v1, v2 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s9 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 
v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; 
%bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s9 +; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: v_mov_b32_e32 v1, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_addk_i32 s8, 
0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; 
GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s9 +; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; 
GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst - ret void + %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result } -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: +define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: 
s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 -; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 
v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX940-NEXT: s_mov_b32 s11, 0x7060302 -; GFX940-NEXT: .LBB17_3: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s5 +; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s4, s4, 3 +; GFX940-NEXT: s_lshl_b32 s6, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX940-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: s_mov_b64 s[8:9], exec +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v10 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 -; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; 
GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 -; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 -; GFX940-NEXT: s_mov_b64 exec, s[8:9] +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB17_3 -; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX940-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: 
s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB17_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 -; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop 
Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB17_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: 
s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX10-NEXT: .LBB17_3: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 
v4, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 -; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: 
v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB17_3 -; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: ; implicit-def: $vgpr4 -; 
GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 -; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB17_4 Depth 2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 -; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 -; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s9 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; 
GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 -; GFX90A-NEXT: s_mov_b64 exec, s[12:13] +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX908-NEXT: s_mov_b32 s15, 0x7060302 -; GFX908-NEXT: .LBB17_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB17_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 -; GFX908-NEXT: v_or_b32_e32 
v11, 0x400000, v5 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v0 -; GFX908-NEXT: v_readfirstlane_b32 s9, v1 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: v_readfirstlane_b32 s11, v3 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s9 +; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX908-NEXT: 
v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB17_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; 
GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX8-NEXT: .LBB17_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX6-NEXT: v_mul_f32_e32 v0, 
1.0, v0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 +; 
GFX12-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-NEXT: v_not_b32_e32 v9, v6 +; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX12-NEXT: 
v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_3 +; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_u32_e32 v4, 0x200, 
v4 +; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 +; GFX940-NEXT: v_not_b32_e32 v10, v4 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX940-NEXT: s_movk_i32 s10, 0x7fff +; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Loop Header: Depth=1 +; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_mov_b64 s[8:9], exec +; GFX940-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX940-NEXT: ; 
=> This Inner Loop Header: Depth=2 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX940-NEXT: s_mov_b64 exec, s[8:9] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB18_3 +; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-NEXT: v_not_b32_e32 v9, v6 +; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: .LBB18_4: ; Parent 
Loop BB18_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX10-NEXT: v_not_b32_e32 v9, v6 +; GFX10-NEXT: .LBB18_1: ; =>This 
Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Loop Header: Depth=1 +; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; 
GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB18_3 +; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 +; GFX90A-NEXT: v_not_b32_e32 v10, v4 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: ; %bb.2: +; 
GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX90A-NEXT: s_movk_i32 s14, 0x7fff +; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Loop Header: Depth=1 +; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX90A-NEXT: s_mov_b64 exec, s[12:13] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; 
GFX90A-NEXT: s_cbranch_execnz .LBB18_3 +; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 +; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 +; GFX908-NEXT: v_not_b32_e32 v9, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX908-NEXT: s_movk_i32 s14, 0x7fff +; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX908-NEXT: 
v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB18_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 +; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 
v4, v7, s4 +; GFX8-NEXT: v_not_b32_e32 v9, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], 
v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB18_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB18_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 +; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX7-NEXT: v_not_b32_e32 v9, v4 +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; 
GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Loop Header: Depth=1 +; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX7-NEXT: s_mov_b64 exec, s[12:13] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB18_3 +; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; 
GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 +; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_not_b32_e32 v9, v4 +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX6-NEXT: s_mov_b64 s[6:7], 0 +; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Loop Header: Depth=1 +; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: 
v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB18_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX6-NEXT: s_mov_b64 exec, s[12:13] +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_cbranch_execnz .LBB18_3 +; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result +} + +; -------------------------------------------------------------------- +; <2 x half> +; -------------------------------------------------------------------- + +define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; 
GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: 
s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 
offen offset:1024 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB19_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: 
s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX6-NEXT: s_waitcnt 
vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB20_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v5 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; 
GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX940-NEXT: ; implicit-def: $vgpr4 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB21_1 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v7, v8, v5 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_waitcnt_vscnt null, 
0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 
s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Loop Header: Depth=1 +; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v7, v8, v5 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB21_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB21_3 +; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX908-NEXT: 
v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v7, v8, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v7, v8 +; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB21_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: s_setpc_b64 
s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v6, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v6, v7 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v7, v8 +; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: 
buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB21_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX7-NEXT: ; 
=>This Loop Header: Depth=1 +; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v10 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v11 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX7-NEXT: s_mov_b64 exec, s[12:13] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB21_3 +; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: ; implicit-def: $vgpr4 +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB21_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX6-NEXT: s_mov_b64 s[6:7], 0 +; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Loop Header: Depth=1 +; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v10 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v11 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: 
v_mov_b32_e32 v7, v5 +; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB21_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX6-NEXT: s_mov_b64 exec, s[12:13] +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_cbranch_execnz .LBB21_3 +; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; 
GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) { +; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; 
GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: 
v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB23_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +define <2 x half> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: 
buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v4, v1, 
v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; 
GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret <2 x half> %result +} + +define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_mov_b32 
s4, 0 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; 
GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz 
.LBB25_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 + 
%unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret void +} + +; -------------------------------------------------------------------- +; <2 x bfloat> +; -------------------------------------------------------------------- + +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_addk_i32 s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[6:7], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX940-NEXT: 
v_bfe_u32 v5, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: 
buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; 
GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 
0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v1, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB26_1: ; 
%atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_addk_i32 s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[6:7], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 
v1, v5 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; 
GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: s_andn2_b64 exec, exec, 
s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; 
GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, 
s[10:11] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, v5 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: 
v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: ; implicit-def: $vgpr4 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX940-NEXT: s_movk_i32 s10, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX940-NEXT: s_mov_b32 s11, 0x7060302 +; GFX940-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Loop Header: Depth=1 +; GFX940-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX940-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: s_mov_b64 s[8:9], exec +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v10 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 +; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX940-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; 
GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB28_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX940-NEXT: s_mov_b64 exec, s[8:9] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB28_3 +; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: 
buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB28_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB28_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; 
GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX10-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Loop Header: Depth=1 +; GFX10-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: s_mov_b32 s6, exec_lo +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: 
s_cbranch_execnz .LBB28_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB28_3 +; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX90A-NEXT: s_movk_i32 s14, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 +; GFX90A-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Loop Header: Depth=1 +; GFX90A-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 +; GFX90A-NEXT: s_mov_b64 s[12:13], exec +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB28_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX90A-NEXT: s_mov_b64 exec, s[12:13] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB28_3 +; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX908-NEXT: s_movk_i32 s14, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX908-NEXT: s_mov_b32 s15, 0x7060302 +; GFX908-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 +; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: 
v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v0 +; GFX908-NEXT: v_readfirstlane_b32 s9, v1 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB28_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB28_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 
s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX8-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s9, v1 +; GFX8-NEXT: v_readfirstlane_b32 s10, v2 +; GFX8-NEXT: v_readfirstlane_b32 s11, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; 
GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB28_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB28_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: v_and_b32_e32 
v10, 0xffff0000, v5 +; GFX7-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Loop Header: Depth=1 +; GFX7-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB28_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX7-NEXT: s_mov_b64 exec, s[12:13] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB28_3 +; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: ; implicit-def: $vgpr4 +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: s_mov_b64 s[6:7], 0 +; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX6-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Loop Header: Depth=1 +; GFX6-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX6-NEXT: ; => This 
Inner Loop Header: Depth=2 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB28_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX6-NEXT: s_mov_b64 exec, s[12:13] +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_cbranch_execnz .LBB28_3 +; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 
th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_addk_i32 s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[6:7], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: 
s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX10-NEXT: 
v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] 
op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v1, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], 
v4, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) 
+; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 
+; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr 
addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + ret <2 x bfloat> %result +} + +define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_addk_i32 s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[6:7], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: 
v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 
s4, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; 
GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; 
GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, 
v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 
+; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_addk_i32 s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[6:7], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX940-NEXT: v_bfe_u32 
v5, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, 
s[4:7], 0 offen offset:1024 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 
0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 
s13, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 +; GFX908-NEXT: v_mov_b32_e32 v1, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 
+; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; 
GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret <2 x bfloat> %result +} + +define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_addk_i32 s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[6:7], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 
+; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 
+; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword 
v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret void +} + +define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; 
GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_addk_i32 s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[6:7], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX940-NEXT: s_mov_b32 s9, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX940-NEXT: v_perm_b32 v0, 
v5, v0, s9 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; 
GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 +; GFX90A-NEXT: v_add3_u32 v8, 
v8, v5, s12 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, 
v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX908-NEXT: v_mov_b32_e32 v6, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; 
GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s9, v1 -; GFX8-NEXT: v_readfirstlane_b32 s10, v2 -; GFX8-NEXT: v_readfirstlane_b32 s11, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; 
GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB17_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: s_mov_b64 
s[10:11], 0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX7-NEXT: .LBB17_3: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB17_4 Depth 2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 -; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; 
GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 -; GFX7-NEXT: s_mov_b64 exec, s[12:13] +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB17_3 -; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 
s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX6-NEXT: .LBB17_3: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB17_4 Depth 2 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v9 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: s_waitcnt 
expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 -; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 -; GFX6-NEXT: s_mov_b64 exec, s[12:13] +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 -; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB17_3 -; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: 
s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst - ret <2 x bfloat> %result + %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret void } ; -------------------------------------------------------------------- ; misc ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: +define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -7324,31 +11366,32 @@ define float 
@buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -7364,25 +11407,21 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -7390,157 +11429,137 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; 
GFX10-NEXT: s_cbranch_execnz .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX8-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v0 ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; 
GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v0 @@ -7548,22 +11567,22 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fadd ptr addrspace(7) %gep, float %val seq_cst + %result = atomicrmw fadd ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } - +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 503065cc076477..06dee9c279f2c5 100644 --- 
a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -13,28 +13,28 @@ ; float ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -57,10 +57,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -68,35 +68,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; 
GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -104,31 +96,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: 
s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -137,31 +125,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: 
v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -170,69 +154,61 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; 
GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 
; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -255,10 +231,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -266,65 +242,53 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 
-; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; 
GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -332,31 +296,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword 
v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -364,51 +324,43 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -441,7 +393,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -502,7 +454,7 @@ define float 
@buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, exec_lo @@ -532,7 +484,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, exec_lo @@ -561,7 +513,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -620,7 +572,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 @@ -680,7 +632,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 @@ -740,7 +692,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[6:7], exec @@ -766,7 +718,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[6:7], exec @@ -793,861 +745,1549 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: 
s_cbranch_execnz .LBB3_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: 
buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; 
GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, 
exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax 
ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret float %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: 
buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 
v2, v7 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr 
addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret float %result } -define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: 
v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] -; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: 
s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB5_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v6 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB5_1: ; =>This Inner Loop 
Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX11-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB5_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: 
buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 
offen offset:2048 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: 
buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX7-NEXT: 
s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; 
GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB7_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; 
GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB7_3 +; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX940-NEXT: ; implicit-def: $vgpr4 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; 
GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB7_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB7_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; 
GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v11 +; GFX908-NEXT: v_mov_b32_e32 v1, v12 +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, v14 +; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: 
buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX908-NEXT: v_mov_b32_e32 v14, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v13, v0 +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB7_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child 
Loop BB7_4 Depth 2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v1, v12 +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v13, v0 +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB7_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 
s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-NEXT: v_mov_b32_e32 v1, v6 +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX6-NEXT: ; implicit-def: $vgpr4 +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, v6 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, double 
%val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: 
v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret double %result +} + +define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: buffer_wbinvl1 ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v10, v1 ; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX908-NEXT: 
s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v11 -; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 -; GFX908-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX908-NEXT: v_mov_b32_e32 v14, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v13, v0 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB5_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: 
s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop 
BB5_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v11 -; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 -; GFX8-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX8-NEXT: v_mov_b32_e32 v14, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB5_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-NEXT: v_mov_b32_e32 v1, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: 
v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v1, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %result } @@ -1655,27 +2295,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a ; half ; -------------------------------------------------------------------- -define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; 
GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1697,26 +2336,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s5 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; 
GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1733,29 +2372,28 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v4, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; 
GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1778,289 +2416,264 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v4, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: 
v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: 
s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s9 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; 
GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s9 +; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; 
GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 +; 
GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s9 +; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB6_1: ; 
%atomicrmw.start +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, 
s[8:11], 0 offen +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: 
v_lshrrev_b32_e32 v0, s10, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2082,25 +2695,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s5 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2117,28 +2730,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2161,261 +2773,237 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: 
s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s9 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], 
v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s9 +; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 
+; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; 
GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 
s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; 
GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2431,7 +3019,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -2444,14 +3032,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; 
GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -2466,7 +3054,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -2481,8 +3069,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB12_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2491,13 +3079,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB8_3 +; GFX12-NEXT: s_cbranch_execnz .LBB12_3 
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2508,7 +3096,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -2520,14 +3108,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2537,7 +3125,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB8_4: ; 
Parent Loop BB8_3 Depth=1 +; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -2551,8 +3139,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -2560,13 +3148,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB8_3 +; GFX940-NEXT: s_cbranch_execnz .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -2579,7 +3167,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: 
v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -2592,14 +3180,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -2614,7 +3202,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -2629,8 +3217,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2640,13 +3228,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_or_b32 s1, 
vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB8_3 +; GFX11-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -2657,7 +3245,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -2669,13 +3257,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -2686,7 +3274,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: 
v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -2700,8 +3288,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB12_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2711,13 +3299,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB8_3 +; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2728,7 +3316,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB12_1: ; =>This 
Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -2740,14 +3328,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2756,7 +3344,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -2769,8 +3357,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 
v4, v7 @@ -2778,13 +3366,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2795,7 +3383,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -2807,14 +3395,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; 
GFX908-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2824,7 +3412,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -2837,8 +3425,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB12_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2846,13 +3434,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB8_3 +; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -2863,7 +3451,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: 
v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -2875,14 +3463,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2893,7 +3481,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -2906,8 +3494,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB12_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2915,13 +3503,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB8_3 +; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -2931,7 +3519,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -2942,15 +3530,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; 
GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -2962,7 +3550,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -2975,8 +3563,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2984,14 +3572,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB8_3 +; GFX7-NEXT: s_cbranch_execnz .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -3001,7 +3589,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; 
GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -3012,15 +3600,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -3032,7 +3620,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -3045,8 +3633,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB12_4 +; GFX6-NEXT: ; %bb.5: ; in 
Loop: Header=BB12_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3054,7 +3642,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB8_3 +; GFX6-NEXT: s_cbranch_execnz .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -3062,7 +3650,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -3070,27 +3658,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; bfloat ; -------------------------------------------------------------------- -define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3119,27 +3706,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s5 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, 
s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3161,29 +3748,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v4, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: 
.LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3213,33 +3799,29 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v4, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 @@ -3247,133 +3829,121 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s9 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: 
buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s9 +; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB9_1: ; 
%atomicrmw.start +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; 
GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f32_e32 v3, v3, v5 @@ -3383,141 +3953,132 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s9 +; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; 
GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: 
buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - 
%result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3546,26 +4107,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: 
s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s5 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3587,28 +4148,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3638,32 +4198,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; 
GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 @@ -3671,130 +4227,118 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap 
v[4:5], v2, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s9 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: 
v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; 
GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s9 +; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f32_e32 v5, v5, v3 
@@ -3804,117 +4348,109 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_load_dword v1, 
v2, s[4:7], 0 offen +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; 
GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: 
s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3930,7 +4466,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -3943,14 +4479,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; 
GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -3972,7 +4508,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3987,8 +4523,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -3997,13 +4533,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB11_3 +; GFX12-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4014,7 +4550,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -4026,15 +4562,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_mov_b64 s[8:9], exec @@ -4049,7 +4585,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; 
GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -4063,8 +4599,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4072,13 +4608,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB11_3 +; GFX940-NEXT: s_cbranch_execnz .LBB15_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4091,7 +4627,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: 
v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -4104,15 +4640,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -4134,7 +4670,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -4149,8 +4685,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4160,14 +4696,14 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4178,7 +4714,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -4190,13 +4726,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -4211,7 +4747,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -4225,8 +4761,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB15_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4236,13 +4772,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB11_3 +; GFX10-NEXT: s_cbranch_execnz .LBB15_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4253,7 +4789,7 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -4265,15 +4801,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_max_f32_e32 v4, v4, v11 @@ -4286,7 +4822,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -4299,8 +4835,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4308,13 +4844,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4325,7 +4861,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -4337,15 +4873,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; 
GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_max_f32_e32 v4, v4, v10 @@ -4359,7 +4895,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -4372,8 +4908,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB15_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4381,13 +4917,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB11_3 +; GFX908-NEXT: s_cbranch_execnz .LBB15_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: 
v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -4398,7 +4934,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -4410,14 +4946,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f32_e32 v4, v4, v10 @@ -4433,7 +4969,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX8-NEXT: ; => This Inner 
Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -4446,8 +4982,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB15_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4455,13 +4991,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB11_3 +; GFX8-NEXT: s_cbranch_execnz .LBB15_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -4471,7 +5007,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -4482,15 +5018,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], 
s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -4503,7 +5039,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -4516,8 +5052,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4525,14 +5061,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB11_3 +; GFX7-NEXT: s_cbranch_execnz .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; 
%atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -4542,7 +5078,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -4553,15 +5089,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -4574,7 +5110,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: 
.LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -4587,8 +5123,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB15_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4596,7 +5132,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB11_3 +; GFX6-NEXT: s_cbranch_execnz .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -4604,7 +5140,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -4612,22 +5148,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; <2 x half> ; -------------------------------------------------------------------- -define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -4644,22 +5180,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, 
v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -4674,22 +5210,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -4707,26 +5243,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -4735,65 +5267,57 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 
+; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -4801,33 +5325,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; 
GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -4838,38 +5358,34 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] 
+; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4885,41 +5401,37 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4936,40 +5448,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; 
GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; 
GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 @@ -4985,21 +5497,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5014,21 +5526,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5045,25 +5557,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 ; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 
s8, 0 +; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5071,97 +5579,85 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: 
v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 
v3, s10 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; 
GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -5171,39 +5667,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; 
GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5219,41 +5711,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 ; 
GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5270,27 +5758,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5299,7 +5787,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -5313,14 +5801,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -5329,7 +5817,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_pk_max_num_f16 
v5, v4, v8 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -5344,8 +5832,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5354,18 +5842,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB14_3 +; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -5378,21 
+5866,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX940-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -5406,8 +5894,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5415,19 +5903,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB14_3 +; GFX940-NEXT: 
s_cbranch_execnz .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -5441,14 +5929,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -5457,7 +5945,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: v_pk_max_f16 v5, v4, v8 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; 
GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -5472,8 +5960,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5483,19 +5971,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB14_3 +; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -5508,13 +5996,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_cbranch_execnz 
.LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX10-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -5522,7 +6010,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_pk_max_f16 v5, v4, v8 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -5536,8 +6024,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5547,18 +6035,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB14_3 +; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; 
GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -5571,20 +6059,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -5597,8 +6085,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_4 -; GFX90A-NEXT: ; %bb.5: ; in 
Loop: Header=BB14_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5606,18 +6094,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -5630,21 +6118,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX908-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB18_4 Depth 
2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX908-NEXT: v_pk_max_f16 v5, v4, v8 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -5657,8 +6145,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5666,18 +6154,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB14_3 +; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 
; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -5690,15 +6178,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 -; GFX8-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 @@ -5708,7 +6196,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -5721,8 +6209,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5730,18 +6218,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB14_3 +; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -5753,7 +6241,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -5765,9 +6253,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; 
GFX7-NEXT: s_mov_b64 s[12:13], exec @@ -5783,7 +6271,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -5796,8 +6284,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -5807,19 +6295,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB14_3 +; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: 
v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -5831,7 +6319,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -5843,9 +6331,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec @@ -5862,7 +6350,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -5875,8 +6363,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB18_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, 
s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -5886,7 +6374,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB14_3 +; GFX6-NEXT: s_cbranch_execnz .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v4 @@ -5894,7 +6382,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -5902,23 +6390,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: 
s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v0 @@ -5951,25 +6439,25 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_addk_i32 s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v0 @@ -5996,16 +6484,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add 
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cbranch_execnz .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -6014,7 +6502,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v0 @@ -6048,28 +6536,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -6085,42 +6569,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 
exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v0 @@ -6135,40 +6615,36 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 ; 
GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ 
-6183,39 +6659,35 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] ; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 
+; GFX8-NEXT: s_mov_b64 s[10:11], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6232,44 +6704,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: 
s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6283,39 +6751,35 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6330,39 +6794,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6393,24 +6857,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; 
GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_addk_i32 s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6437,23 +6901,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_cbranch_execnz .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: 
v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6485,27 +6949,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 
v4, s8 +; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6520,42 +6980,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: 
v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6569,40 +7025,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6616,39 +7068,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], 
v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6664,45 +7112,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, 
v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: 
s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6716,39 +7160,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 
s8, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6763,26 +7203,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6791,7 +7231,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -6805,15 +7245,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -6837,7 +7277,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_perm_b32 
v5, v5, v4, 0x7060302 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -6852,8 +7292,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -6862,18 +7302,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB17_3 +; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ 
-6886,7 +7326,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cbranch_execnz .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 @@ -6894,9 +7334,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_movk_i32 s10, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 -; GFX940-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX940-NEXT: v_max_f32_e32 v4, v4, v9 @@ -6917,7 +7357,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -6931,8 +7371,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB21_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -6940,19 +7380,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB17_3 +; GFX940-NEXT: s_cbranch_execnz .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -6966,16 +7406,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; 
GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -6999,7 +7439,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -7014,8 +7454,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7025,20 +7465,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB17_3 +; GFX11-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB17_1: ; =>This 
Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -7051,14 +7491,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX10-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 @@ -7079,7 +7519,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -7093,8 +7533,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB21_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7104,18 +7544,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB17_3 +; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -7128,7 +7568,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 @@ -7136,9 +7576,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 -; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX90A-NEXT: v_max_f32_e32 v4, v4, v9 @@ -7157,7 +7597,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -7170,8 +7610,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -7179,18 +7619,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; 
GFX908-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -7203,7 +7643,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 @@ -7211,9 +7651,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_movk_i32 s14, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 -; GFX908-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX908-NEXT: v_max_f32_e32 v4, v4, v8 @@ -7233,7 +7673,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -7246,8 +7686,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz 
.LBB17_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7255,18 +7695,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB17_3 +; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -7279,15 +7719,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX8-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB17_4 
Depth 2 +; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v8 @@ -7310,7 +7750,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -7323,8 +7763,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7332,18 +7772,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB17_3 +; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 
+; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -7355,7 +7795,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7366,9 +7806,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX7-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7382,7 +7822,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -7395,8 +7835,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB21_4 
+; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7405,19 +7845,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB17_3 +; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -7429,7 +7869,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7440,9 +7880,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX6-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This 
Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7456,7 +7896,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -7469,8 +7909,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB21_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7480,14 +7920,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB17_3 +; GFX6-NEXT: s_cbranch_execnz .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v7 ; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -7495,21 +7935,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; misc ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -7526,22 +7966,22 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -7555,21 +7995,21 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_cbranch_execnz .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB22_1: ; 
%atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -7587,26 +8027,22 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -7615,33 +8051,29 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen 
glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 @@ -7649,33 +8081,29 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: 
buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -7683,32 +8111,28 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], 
v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 @@ -7716,32 +8140,28 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: 
buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v0 @@ -7749,32 +8169,28 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX7-NEXT: 
s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v0 @@ -7783,22 +8199,22 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, float %val seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } - +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index e0e6ccd72caeaa..2791162396a910 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -13,28 +13,28 @@ ; float ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen 
offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -57,10 +57,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -68,35 +68,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -104,31 +96,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 
exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -137,31 +125,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -170,69 +154,61 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -255,10 +231,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -266,65 +242,53 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], 
v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -332,31 +296,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -364,51 +324,43 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 
exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr 
addrspace(7) %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -441,7 +393,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -502,7 +454,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, exec_lo @@ -532,7 +484,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, exec_lo @@ -561,7 +513,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX10-NEXT: buffer_gl0_inv ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -620,7 +572,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 @@ -680,7 +632,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 @@ -740,7 +692,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[6:7], exec @@ -766,7 +718,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; 
GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[6:7], exec @@ -793,861 +745,1549 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: 
v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB3_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: 
buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: 
v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: 
v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret float %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: 
buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; 
GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: 
s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, 
s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; 
GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret float %result } -define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: 
s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB5_4 Depth 2 -; 
GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] -; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; 
GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB5_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v6 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 ; GFX940-NEXT: 
buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 
offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX11-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; 
GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB5_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: 
v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; 
GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: 
buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: 
buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: 
buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; 
GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX12-NEXT: 
; => This Inner Loop Header: Depth=2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB7_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB7_3 +; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX940-NEXT: ; implicit-def: $vgpr4 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: ; %bb.2: 
+; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB7_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB7_3 +; GFX11-NEXT: 
; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: 
v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop 
Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v11 +; GFX908-NEXT: v_mov_b32_e32 v1, v12 +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, v14 +; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX908-NEXT: v_mov_b32_e32 v14, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v13, v0 +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB7_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 
+; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v1, v12 +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX8-NEXT: 
s_mov_b64 exec, s[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v13, v0 +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB7_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-NEXT: v_mov_b32_e32 v1, v6 +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: 
v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX6-NEXT: ; implicit-def: $vgpr4 +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, v6 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: 
s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 
+; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret double %result +} + +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; 
GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v10, v1 ; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], 
exec -; GFX908-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v11 -; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 -; GFX908-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_4 -; GFX908-NEXT: ; 
%bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX908-NEXT: v_mov_b32_e32 v14, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v13, v0 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB5_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; 
GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v11 -; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 -; GFX8-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: ; %bb.5: ; in 
Loop: Header=BB5_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX8-NEXT: v_mov_b32_e32 v14, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB5_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX7-NEXT: ; implicit-def: $vgpr4 
-; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-NEXT: v_mov_b32_e32 v1, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v1, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst + %result = atomicrmw fmin 
ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %result } @@ -1655,27 +2295,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a ; half ; -------------------------------------------------------------------- -define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1697,26 +2336,26 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s5 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1733,29 +2372,28 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; 
GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v4, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1778,289 +2416,264 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 
-; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v4, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: 
s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s9 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX90A-NEXT: 
v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s9 +; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; 
GFX908-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, 
s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s9 +; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; 
GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 
; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 
0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2082,25 +2695,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s5 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen 
-; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2117,28 +2730,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB7_1: 
; %atomicrmw.start +; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2161,261 +2773,237 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1 ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 
0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s9 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; 
GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s9 +; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 ; 
GFX7-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 
s[4:5], 0 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2431,7 +3019,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -2444,14 +3032,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -2466,7 +3054,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: 
Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -2481,8 +3069,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB12_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2491,13 +3079,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB8_3 +; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2508,7 +3096,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -2520,14 +3108,14 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2537,7 +3125,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -2551,8 +3139,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -2560,13 +3148,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: 
s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB8_3 +; GFX940-NEXT: s_cbranch_execnz .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -2579,7 +3167,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -2592,14 +3180,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -2614,7 +3202,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: 
v_and_or_b32 v5, v6, v9, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -2629,8 +3217,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2640,13 +3228,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB8_3 +; GFX11-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -2657,7 +3245,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB12_1: 
; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -2669,13 +3257,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -2686,7 +3274,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -2700,8 +3288,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB12_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2711,13 +3299,13 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB8_3 +; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2728,7 +3316,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -2740,14 +3328,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, 
v4 @@ -2756,7 +3344,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -2769,8 +3357,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -2778,13 +3366,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2795,7 +3383,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: 
v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -2807,14 +3395,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2824,7 +3412,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -2837,8 +3425,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB12_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 
Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2846,13 +3434,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB8_3 +; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -2863,7 +3451,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -2875,14 +3463,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; 
GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2893,7 +3481,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -2906,8 +3494,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB12_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2915,13 +3503,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB8_3 +; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -2931,7 +3519,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; 
GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -2942,15 +3530,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -2962,7 +3550,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -2975,8 +3563,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 
Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2984,14 +3572,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB8_3 +; GFX7-NEXT: s_cbranch_execnz .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -3001,7 +3589,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -3012,15 +3600,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: 
; Child Loop BB8_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -3032,7 +3620,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -3045,8 +3633,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB12_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3054,7 +3642,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB8_3 +; GFX6-NEXT: s_cbranch_execnz .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -3062,7 +3650,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") 
seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -3070,27 +3658,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; bfloat ; -------------------------------------------------------------------- -define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3119,27 +3706,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; 
GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s5 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3161,29 +3748,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v4, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3213,33 +3799,29 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v4, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 @@ -3247,133 +3829,121 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; 
GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s9 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; 
GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s9 +; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; 
GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f32_e32 v3, v3, v5 @@ -3383,141 +3953,132 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 
-; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s9 +; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: 
s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 
v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; 
GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3546,26 +4107,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s5 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; 
GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3587,28 +4148,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen 
; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3638,32 +4198,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: 
v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 @@ -3671,130 +4227,118 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 
s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s9 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: 
buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s9 +; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: 
v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: 
s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 @@ -3804,117 +4348,109 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: 
s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: 
v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; 
GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { +; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3930,7 +4466,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -3943,14 +4479,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -3972,7 +4508,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3987,8 +4523,8 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -3997,13 +4533,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB11_3 +; GFX12-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4014,7 +4550,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -4026,15 +4562,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, 
v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_mov_b64 s[8:9], exec @@ -4049,7 +4585,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -4063,8 +4599,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4072,13 +4608,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; 
GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB11_3 +; GFX940-NEXT: s_cbranch_execnz .LBB15_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4091,7 +4627,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -4104,15 +4640,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -4134,7 +4670,7 @@ 
define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -4149,8 +4685,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4160,14 +4696,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4178,7 +4714,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: 
v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -4190,13 +4726,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -4211,7 +4747,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -4225,8 +4761,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB15_4 +; 
GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4236,13 +4772,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB11_3 +; GFX10-NEXT: s_cbranch_execnz .LBB15_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4253,7 +4789,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -4265,15 +4801,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB15_3: ; 
%atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_min_f32_e32 v4, v4, v11 @@ -4286,7 +4822,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -4299,8 +4835,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4308,13 +4844,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4325,7 +4861,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -4337,15 +4873,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_min_f32_e32 v4, v4, v10 @@ -4359,7 +4895,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, 
v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -4372,8 +4908,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB15_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4381,13 +4917,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB11_3 +; GFX908-NEXT: s_cbranch_execnz .LBB15_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -4398,7 +4934,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -4410,14 +4946,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, 
s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f32_e32 v4, v4, v10 @@ -4433,7 +4969,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -4446,8 +4982,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB15_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4455,13 +4991,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB11_3 +; GFX8-NEXT: s_cbranch_execnz .LBB15_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; 
GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -4471,7 +5007,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -4482,15 +5018,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -4503,7 +5039,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX7-NEXT: .LBB15_4: ; 
Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -4516,8 +5052,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4525,14 +5061,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB11_3 +; GFX7-NEXT: s_cbranch_execnz .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -4542,7 +5078,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -4553,15 +5089,15 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -4574,7 +5110,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -4587,8 +5123,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB15_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4596,7 +5132,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 
exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB11_3 +; GFX6-NEXT: s_cbranch_execnz .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -4604,7 +5140,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -4612,22 +5148,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; <2 x half> ; -------------------------------------------------------------------- -define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB12_1: ; 
%atomicrmw.start +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -4644,22 +5180,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -4674,22 +5210,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -4707,26 +5243,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; 
GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -4735,65 +5267,57 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; 
GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_pk_max_f16 
v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -4801,33 +5325,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -4838,38 +5358,34 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s10, s8, 
0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4885,41 +5401,37 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 
+; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4936,40 +5448,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") 
seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 @@ -4985,21 +5497,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5014,21 +5526,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5045,25 +5557,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 ; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5071,97 +5579,85 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 
+; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; 
GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -5171,39 +5667,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: 
; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5219,41 +5711,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5270,27 +5758,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap 
v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5299,7 +5787,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 
s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -5313,14 +5801,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -5329,7 +5817,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -5344,8 +5832,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5354,18 +5842,18 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB14_3 +; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -5378,21 +5866,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX940-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB14_4: ; Parent Loop 
BB14_3 Depth=1 +; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -5406,8 +5894,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5415,19 +5903,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB14_3 +; GFX940-NEXT: s_cbranch_execnz .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -5441,14 +5929,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 
0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -5457,7 +5945,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: v_pk_min_f16 v5, v4, v8 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -5472,8 +5960,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5483,19 +5971,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB14_3 +; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; 
%atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -5508,13 +5996,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX10-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -5522,7 +6010,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_pk_min_f16 v5, v4, v8 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -5536,8 +6024,8 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5547,18 +6035,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB14_3 +; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -5571,20 +6059,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; 
GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -5597,8 +6085,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5606,18 +6094,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; 
GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -5630,21 +6118,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX908-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX908-NEXT: v_pk_min_f16 v5, v4, v8 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -5657,8 +6145,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 
Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5666,18 +6154,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB14_3 +; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -5690,15 +6178,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 -; GFX8-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 @@ -5708,7 +6196,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -5721,8 +6209,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5730,18 +6218,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB14_3 +; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB18_1: ; =>This 
Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -5753,7 +6241,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -5765,9 +6253,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[12:13], exec @@ -5783,7 +6271,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -5796,8 +6284,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; 
GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -5807,19 +6295,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB14_3 +; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -5831,7 +6319,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -5843,9 +6331,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX6-NEXT: ; 
Child Loop BB18_4 Depth 2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec @@ -5862,7 +6350,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -5875,8 +6363,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB18_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -5886,7 +6374,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB14_3 +; GFX6-NEXT: s_cbranch_execnz .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v4 @@ -5894,7 +6382,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -5902,23 +6390,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v0 @@ -5951,25 +6439,25 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_addk_i32 s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v0 @@ -5996,16 +6484,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cbranch_execnz .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -6014,7 +6502,7 @@ define 
<2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v0 @@ -6048,28 +6536,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
v_mov_b32_e32 v6, v0 @@ -6085,42 +6569,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 
offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v0 @@ -6135,40 +6615,36 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; 
GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -6183,39 +6659,35 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] ; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, 
s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6232,44 +6704,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: 
v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 
v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6283,39 +6751,35 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6330,39 +6794,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6393,24 +6857,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_addk_i32 s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 
0xffff0000, v0 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6437,23 +6901,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_cbranch_execnz .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6485,27 +6949,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; 
GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6520,42 +6980,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX10-NEXT: 
buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6569,40 +7025,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: v_add3_u32 v6, 
v6, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX908-NEXT: 
s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6616,39 +7068,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, 
s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6664,45 +7112,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: 
; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6716,39 +7160,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 
s[10:11], vcc, s[10:11] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6763,26 +7203,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX6-NEXT: 
buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6791,7 +7231,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; 
GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -6805,15 +7245,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -6837,7 +7277,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -6852,8 +7292,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -6862,18 +7302,18 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB17_3 +; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -6886,7 +7326,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cbranch_execnz .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 @@ -6894,9 +7334,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_movk_i32 s10, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 -; GFX940-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 
; GFX940-NEXT: v_min_f32_e32 v4, v4, v9 @@ -6917,7 +7357,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -6931,8 +7371,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB21_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -6940,19 +7380,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB17_3 +; GFX940-NEXT: s_cbranch_execnz .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: 
.LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -6966,16 +7406,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -6999,7 +7439,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -7014,8 +7454,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-NEXT: s_mov_b32 
exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7025,20 +7465,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB17_3 +; GFX11-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -7051,14 +7491,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX10-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; 
GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 @@ -7079,7 +7519,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -7093,8 +7533,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB21_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7104,18 +7544,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB17_3 +; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop 
Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -7128,7 +7568,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 @@ -7136,9 +7576,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 -; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX90A-NEXT: v_min_f32_e32 v4, v4, v9 @@ -7157,7 +7597,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -7170,8 +7610,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; 
GFX90A-NEXT: s_cbranch_execnz .LBB21_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -7179,18 +7619,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -7203,7 +7643,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 @@ -7211,9 +7651,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_movk_i32 s14, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 -; GFX908-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX908-NEXT: 
.LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX908-NEXT: v_min_f32_e32 v4, v4, v8 @@ -7233,7 +7673,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -7246,8 +7686,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7255,18 +7695,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB17_3 +; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -7279,15 +7719,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX8-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v8 @@ -7310,7 +7750,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -7323,8 +7763,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX8-NEXT: 
s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7332,18 +7772,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB17_3 +; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -7355,7 +7795,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7366,9 +7806,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX7-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: 
Depth=1 -; GFX7-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7382,7 +7822,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -7395,8 +7835,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7405,19 +7845,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB17_3 +; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; 
GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -7429,7 +7869,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7440,9 +7880,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX6-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7456,7 +7896,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -7469,8 +7909,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz 
.LBB17_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB21_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7480,14 +7920,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB17_3 +; GFX6-NEXT: s_cbranch_execnz .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v7 ; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -7495,21 +7935,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; misc ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, 
v0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -7526,22 +7966,22 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -7555,21 +7995,21 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_cbranch_execnz .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -7587,26 +8027,22 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: 
s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -7615,33 +8051,29 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: 
buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 @@ -7649,33 +8081,29 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, 
s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -7683,32 +8111,28 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; 
GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 @@ -7716,32 +8140,28 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, 
s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v0 @@ -7749,32 +8169,28 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; 
GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v0 @@ -7783,22 +8199,22 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } - +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll index 08a997530d3c94..16f29cc329976c 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll +++ 
b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @buffer_ptr_vector_ops(ptr addrspace(1) %somewhere) { ; GISEL-LABEL: buffer_ptr_vector_ops: ; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GISEL-NEXT: v_mov_b32_e32 v8, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -25,7 +25,7 @@ define amdgpu_kernel void @buffer_ptr_vector_ops(ptr addrspace(1) %somewhere) { ; ; SDAG-LABEL: buffer_ptr_vector_ops: ; SDAG: ; %bb.0: ; %main_body -; SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -60,16 +60,16 @@ main_body: define amdgpu_kernel void @buffer_structs(%fat_buffer_struct %arg, ptr addrspace(1) %dest) { ; GISEL-LABEL: buffer_structs: ; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GISEL-NEXT: v_mov_b32_e32 v5, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_ashr_i32 s1, s0, 31 -; GISEL-NEXT: v_mov_b32_e32 v4, s0 -; GISEL-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 +; GISEL-NEXT: s_ashr_i32 s3, s2, 31 +; GISEL-NEXT: s_lshl_b64 s[0:1], s[2:3], 5 ; GISEL-NEXT: s_add_u32 s0, s8, s0 ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v4, s2 ; GISEL-NEXT: s_addc_u32 s1, s9, s1 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -81,15 +81,15 @@ define amdgpu_kernel void @buffer_structs(%fat_buffer_struct %arg, ptr addrspace ; ; SDAG-LABEL: buffer_structs: ; SDAG: ; %bb.0: ; %main_body -; SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; SDAG-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 -; SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: s_ashr_i32 s1, s0, 31 -; SDAG-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 +; SDAG-NEXT: s_ashr_i32 s3, s2, 31 +; SDAG-NEXT: s_lshl_b64 s[0:1], s[2:3], 5 ; SDAG-NEXT: s_add_u32 s0, s8, s0 +; SDAG-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-NEXT: s_addc_u32 s1, s9, s1 ; SDAG-NEXT: buffer_store_dword v0, v0, s[4:7], 0 offen ; SDAG-NEXT: global_store_dword v4, v0, s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 8293280609517a..b26d15ed3a1c8a 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector2: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 5 @@ -19,7 +19,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -30,7 +30,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector2: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, 6 @@ -40,7 +40,7 @@ define 
amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector2: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 5 ; GFX11-NEXT: v_mov_b32_e32 v1, 6 @@ -52,7 +52,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector2: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 5 ; GFX940-NEXT: v_mov_b32_e32 v1, 6 @@ -67,7 +67,7 @@ entry: define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector4: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 5 @@ -80,7 +80,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector4: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 6 ; GFX8-NEXT: v_mov_b32_e32 v2, 7 @@ -93,7 +93,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector4: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, 6 @@ -105,7 +105,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector4: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 
v0, 5 ; GFX11-NEXT: v_mov_b32_e32 v1, 6 @@ -119,7 +119,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector4: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 5 ; GFX940-NEXT: v_mov_b32_e32 v1, 6 @@ -136,7 +136,7 @@ entry: define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector_v2i16: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x60005 @@ -146,7 +146,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector_v2i16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x60005 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -156,7 +156,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector_v2i16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -165,7 +165,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector_v2i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -176,7 +176,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector_v2i16: ; GFX940: ; %bb.0: ; %entry 
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -190,8 +190,8 @@ entry: define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: build_vector_v2i16_trunc: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -201,10 +201,10 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; ; GFX8-LABEL: build_vector_v2i16_trunc: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s2, s4, 16 +; GFX8-NEXT: s_lshr_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s2, s2, 0x50000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -215,11 +215,11 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX10-LABEL: build_vector_v2i16_trunc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s2, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -228,11 +228,11 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX11-LABEL: 
build_vector_v2i16_trunc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_pack_hl_b32_b16 s2, s4, 5 +; GFX11-NEXT: s_pack_hl_b32_b16 s2, s2, 5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -242,14 +242,14 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; ; GFX940-LABEL: build_vector_v2i16_trunc: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshr_b32 s2, s4, 16 -; GFX940-NEXT: s_pack_ll_b32_b16 s2, s2, 5 -; GFX940-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, 5 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm %srl = lshr i32 %a, 16 %trunc = trunc i32 %srl to i16 @@ -262,7 +262,7 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) { ; GFX6-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -277,7 +277,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX8-LABEL: 
build_v2i32_from_v4i16_shuffle: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 @@ -290,7 +290,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 @@ -302,7 +302,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX11-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 @@ -316,7 +316,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s3, s3, 16 diff --git a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll index 5d1647782b0d8f..f1992d71eb1de8 100644 --- a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll @@ -50,7 +50,7 @@ define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 { ; GCN-NEXT: v_and_b32_e32 [[TMP:v[0-9]+]], 0x3ff, v31 ; GCN-NEXT: v_add_i32_e32 v0, vcc, [[TMP]], v0 ; GCN-NEXT: s_setpc_b64 -define hidden i32 @use_workitem_id_x(i32 %arg0) #3 { +define hidden i32 @use_workitem_id_x(i32 %arg0) #0 { %id = call i32 
@llvm.amdgcn.workitem.id.x() %op = add i32 %id, %arg0 ret i32 %op @@ -64,7 +64,7 @@ define hidden i32 @use_workitem_id_x(i32 %arg0) #3 { ; GCN: v_mov_b32_e32 v0, 9 ; GCN: s_swappc_b64 ; GCN: v_add_f32_e32 -define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #3 { +define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #0 { %val = call float @use_workitem_id_x(i32 9) %op = fadd float %val, 1.0 store volatile float %op, ptr addrspace(1) undef @@ -112,4 +112,3 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind noinline } attributes #1 = { alwaysinline nounwind } attributes #2 = { nounwind readnone speculatable } -attributes #3 = { nounwind noinline "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index 6af45035d394f8..ed418070ecb506 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -273,8 +273,8 @@ entry: ret void } -attributes #0 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #1 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind noinline norecurse } +attributes #1 = { nounwind noinline norecurse } attributes #2 
= { nounwind noinline } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll index 06dec7e792389f..c62a0824591050 100644 --- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll @@ -10,9 +10,9 @@ declare hidden void @callee() #0 define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_size !0 { ; CHECK-LABEL: known_x_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 @@ -30,9 +30,9 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_size !1 { ; CHECK-LABEL: known_y_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -49,9 +49,9 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_size !2 { ; CHECK-LABEL: known_z_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; CHECK-NEXT: s_add_u32 
s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -68,9 +68,9 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_size !3 { ; CHECK-LABEL: known_yz_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -87,9 +87,9 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_size !4 { ; CHECK-LABEL: known_xz_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -107,9 +107,9 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_size !5 { ; CHECK-LABEL: known_xyz_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v31, 0 ; CHECK-NEXT: s_mov_b32 s32, 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index 8ef2d89e76d4e1..b711542be5a7fc 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -1,7 +1,5 @@ -; RUN: opt -passes=amdgpu-attributor -mcpu=kaveri < %s | llc -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 < %s | llc -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s - -target triple = "amdgcn-amd-amdhsa" +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}use_dispatch_ptr: ; GCN: s_load_dword s{{[0-9]+}}, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll index b52e7918b27ab1..1d2523d364e550 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -1,7 +1,5 @@ -; RUN: opt -passes=amdgpu-attributor -mcpu=kaveri < %s | llc -mcpu=gfx90a -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7,UNPACKED-TID %s -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx90a -mattr=-xnack < %s | llc -mcpu=gfx90a -mattr=-xnack -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,PACKED-TID %s - -target triple = "amdgcn-amd-amdhsa" +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7,UNPACKED-TID %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack -enable-ipra=0 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,PACKED-TID %s ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 9792c9dabac2f6..5e6f377da28e15 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -1,6 +1,4 @@ -; RUN: opt -mcpu=kaveri -passes=amdgpu-attributor < %s | llc -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s - -target triple = "amdgcn-amd-amdhsa" +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 231d3d97c8f4f3..15ebdd70ae8818 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -8,7 +8,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; SI-LABEL: kernel: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -18,7 +18,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; VI-LABEL: kernel: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -28,7 +28,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; GFX11-LABEL: kernel: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -115,32 +115,21 @@ define amdgpu_kernel void @call_coldcc() #0 { ; SI-LABEL: call_coldcc: ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_mov_b32 s23, 0xe8f000 -; SI-NEXT: s_add_u32 s20, s20, s9 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_mov_b32 s14, s8 -; SI-NEXT: s_mov_b64 s[10:11], s[4:5] -; SI-NEXT: s_add_u32 s8, s2, 36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; SI-NEXT: s_addc_u32 s9, s3, 0 -; SI-NEXT: s_getpc_b64 s[2:3] -; SI-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 -; SI-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 -; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v31, v0, v2 +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0xe8f000 +; SI-NEXT: s_add_u32 s8, s8, s1 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_getpc_b64 s[0:1] +; SI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b64 s[0:1], s[20:21] -; SI-NEXT: s_mov_b64 s[2:3], s[22:23] +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -152,49 +141,31 @@ define amdgpu_kernel void @call_coldcc() #0 { ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 
s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s9 +; VI-NEXT: s_add_u32 s88, s88, s1 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_mov_b32 s14, s8 -; VI-NEXT: s_add_u32 s8, s2, 36 -; VI-NEXT: s_addc_u32 s9, s3, 0 -; VI-NEXT: s_getpc_b64 s[2:3] -; VI-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 -; VI-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 -; VI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; VI-NEXT: s_mov_b64 s[10:11], s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_mov_b64 s[4:5], s[0:1] +; VI-NEXT: s_getpc_b64 s[0:1] +; VI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] -; VI-NEXT: v_or_b32_e32 v31, v0, v2 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: call_coldcc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_u32 s8, s2, 36 -; GFX11-NEXT: s_addc_u32 s9, s3, 0 -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 -; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s12, s13 -; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s13, s14 -; GFX11-NEXT: s_mov_b32 s14, s15 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: 
s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm %val = call float @coldcc(float 1.0) @@ -206,32 +177,21 @@ define amdgpu_kernel void @call_fastcc() #0 { ; SI-LABEL: call_fastcc: ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_mov_b32 s23, 0xe8f000 -; SI-NEXT: s_add_u32 s20, s20, s9 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_mov_b32 s14, s8 -; SI-NEXT: s_mov_b64 s[10:11], s[4:5] -; SI-NEXT: s_add_u32 s8, s2, 36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; SI-NEXT: s_addc_u32 s9, s3, 0 -; SI-NEXT: s_getpc_b64 s[2:3] -; SI-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 -; SI-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 -; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v31, v0, v2 +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0xe8f000 +; SI-NEXT: s_add_u32 s8, s8, s1 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_getpc_b64 s[0:1] +; SI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b64 s[0:1], s[20:21] -; SI-NEXT: s_mov_b64 s[2:3], s[22:23] +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 @@ -243,49 +203,31 @@ define amdgpu_kernel void @call_fastcc() #0 { ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s9 +; VI-NEXT: s_add_u32 s88, s88, s1 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_mov_b32 s14, s8 -; VI-NEXT: s_add_u32 s8, s2, 36 -; VI-NEXT: s_addc_u32 s9, s3, 0 -; VI-NEXT: s_getpc_b64 s[2:3] -; VI-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 -; VI-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 -; VI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; VI-NEXT: s_mov_b64 s[10:11], s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_mov_b64 s[4:5], s[0:1] +; VI-NEXT: s_getpc_b64 s[0:1] +; VI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] -; VI-NEXT: v_or_b32_e32 v31, v0, v2 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: call_fastcc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_u32 s8, s2, 36 -; GFX11-NEXT: s_addc_u32 s9, s3, 0 -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 -; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s12, s13 -; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s13, s14 -; GFX11-NEXT: s_mov_b32 s14, s15 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, 
s0, fastcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm %val = call float @fastcc(float 1.0) @@ -1012,7 +954,7 @@ define amdgpu_ps i16 @ret_ps_mesa_i16() { define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; SI-LABEL: amd_kernel_i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s0, s0, s0 @@ -1023,7 +965,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; ; VI-LABEL: amd_kernel_i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s0, s0, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1032,7 +974,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; ; GFX11-LABEL: amd_kernel_i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s0, s0, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1050,7 +992,7 @@ entry: define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; SI-LABEL: amd_kernel_v2i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 +; SI-NEXT: s_load_dword s1, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1068,7 +1010,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v2i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_i32 s0, s0, s0 @@ -1082,7 +1024,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v2i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: v_add_nc_u16 v1, s0, s0 @@ -1107,7 +1049,7 @@ entry: define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; SI-LABEL: amd_kernel_v4i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 +; SI-NEXT: s_load_dword s1, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1135,7 +1077,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v4i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 24 ; VI-NEXT: s_lshr_b32 s2, s0, 16 @@ -1157,7 +1099,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v4i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: s_lshr_b32 s1, s0, 16 @@ -1194,7 +1136,7 @@ entry: define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; SI-LABEL: amd_kernel_v3i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s0, 2 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1218,7 +1160,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v3i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ 
-1238,7 +1180,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v3i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1270,7 +1212,7 @@ entry: define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; SI-LABEL: amd_kernel_v5i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s0, 4 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1303,7 +1245,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v5i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s0, 24 ; VI-NEXT: s_lshr_b32 s3, s0, 16 @@ -1331,7 +1273,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v5i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16 @@ -1370,7 +1312,7 @@ entry: define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; SI-LABEL: amd_kernel_v8i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1415,7 +1357,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v8i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s1, 24 ; VI-NEXT: s_lshr_b32 s3, s1, 16 @@ 
-1450,7 +1392,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v8i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: v_lshrrev_b16 v1, 8, s1 @@ -1503,7 +1445,7 @@ entry: define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; SI-LABEL: amd_kernel_v16i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1582,7 +1524,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v16i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 24 ; VI-NEXT: s_lshr_b32 s5, s3, 16 @@ -1643,7 +1585,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v16i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s10, s3, 16 ; GFX11-NEXT: s_lshr_b32 s11, s3, 24 @@ -1724,7 +1666,7 @@ entry: define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; SI-LABEL: amd_kernel_v32i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s9, 0 ; SI-NEXT: s_mov_b32 s8, 16 ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1874,7 +1816,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v32i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v10, 0 ; VI-NEXT: v_mov_b32_e32 v11, 0 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1990,7 +1932,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v32i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v3, 8, s2 ; GFX11-NEXT: v_lshrrev_b16 v7, 8, s3 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index f248708d16ea2a..a0499ef6d0f6ae 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -18,8 +18,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; CISI-LABEL: sadd64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -34,8 +34,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; VI-LABEL: sadd64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_add_u32 s0, s6, s0 @@ -48,12 +48,12 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX9-LABEL: sadd64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s6, s0 -; GFX9-NEXT: s_addc_u32 s1, s7, s1 
+; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -62,12 +62,12 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1010-LABEL: sadd64rr: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_add_u32 s0, s6, s0 -; GFX1010-NEXT: s_addc_u32 s1, s7, s1 +; GFX1010-NEXT: s_add_u32 s0, s6, s2 +; GFX1010-NEXT: s_addc_u32 s1, s7, s3 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -76,8 +76,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1030W32-LABEL: sadd64rr: ; GFX1030W32: ; %bb.0: ; %entry ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s0, s6, s0 @@ -90,8 +90,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1030W64-LABEL: sadd64rr: ; GFX1030W64: ; %bb.0: ; %entry ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s0, 
s6, s0 @@ -104,8 +104,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: sadd64rr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s6, s0 ; GFX11-NEXT: s_addc_u32 s1, s7, s1 @@ -129,7 +129,7 @@ entry: define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: sadd64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -144,7 +144,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: sadd64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s2, 0x56789876 @@ -157,7 +157,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: sadd64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -169,7 +169,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1010-LABEL: sadd64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -181,7 +181,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W32-LABEL: sadd64ri: 
; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -193,7 +193,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: sadd64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -205,7 +205,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: sadd64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, 0x56789876 ; GFX11-NEXT: s_addc_u32 s3, s3, 0x1234 @@ -229,7 +229,7 @@ entry: define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: vadd64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: vadd64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 @@ -255,7 +255,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: vadd64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -266,7 +266,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1010-LABEL: vadd64rr: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: v_add_co_u32 v0, s2, s2, v0 @@ -276,7 +276,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W32-LABEL: vadd64rr: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_add_co_u32 v0, s2, s2, v0 @@ -286,7 +286,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: vadd64rr: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s2, v0 @@ -296,12 +296,11 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: vadd64rr: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -323,7 +322,7 @@ entry: define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; CISI-LABEL: vadd64ri: ; CISI: ; %bb.0: ; %entry -; 
CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CISI-NEXT: v_add_i32_e32 v0, vcc, 0x56789876, v0 ; CISI-NEXT: v_mov_b32_e32 v1, 0x1234 ; CISI-NEXT: s_mov_b32 s3, 0xf000 @@ -335,7 +334,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; VI-LABEL: vadd64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x56789876, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0x1234 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -347,7 +346,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX9-LABEL: vadd64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x56789876, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -358,8 +357,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX1010-LABEL: vadd64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1010-NEXT: s_mov_b32 null, 0 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0, 0x1234, s2 @@ -369,7 +367,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX1030W32-LABEL: vadd64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W32-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2 @@ -379,7 +377,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX1030W64-LABEL: vadd64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1030W64-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W64-NEXT: v_add_co_u32 v0, s[2:3], 0x56789876, v0 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[2:3] @@ -389,11 +387,9 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX11-LABEL: vadd64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -415,8 +411,8 @@ entry: define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: suaddo32: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -427,23 +423,23 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: suaddo32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_i32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: 
suaddo32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_add_i32 s0, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm @@ -451,11 +447,11 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-LABEL: suaddo32: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_add_i32 s0, s0, s1 +; GFX1010-NEXT: s_add_i32 s0, s2, s3 ; GFX1010-NEXT: v_mov_b32_e32 v1, s0 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1010-NEXT: s_endpgm @@ -463,37 +459,37 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-LABEL: suaddo32: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: s_add_i32 s0, s0, s1 -; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0 -; GFX1030W32-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1030W32-NEXT: s_add_i32 s2, s2, s3 +; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2 +; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030W32-NEXT: s_endpgm ; ; GFX1030W64-LABEL: suaddo32: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; 
GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: s_add_i32 s0, s0, s1 -; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX1030W64-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1030W64-NEXT: s_add_i32 s2, s2, s3 +; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2 +; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030W64-NEXT: s_endpgm ; ; GFX11-LABEL: suaddo32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_add_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -517,28 +513,28 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: uaddo32_vcc_user: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s11, 0xf000 -; CISI-NEXT: s_mov_b32 s10, -1 -; CISI-NEXT: s_mov_b32 s2, s10 +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CISI-NEXT: s_mov_b32 s3, 0xf000 +; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; 
CISI-NEXT: s_mov_b32 s8, s4 -; CISI-NEXT: v_mov_b32_e32 v0, s13 -; CISI-NEXT: s_mov_b32 s9, s5 -; CISI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 -; CISI-NEXT: s_mov_b32 s0, s6 -; CISI-NEXT: s_mov_b32 s1, s7 -; CISI-NEXT: s_mov_b32 s3, s11 +; CISI-NEXT: s_mov_b32 s0, s4 +; CISI-NEXT: v_mov_b32_e32 v0, s9 +; CISI-NEXT: s_mov_b32 s1, s5 +; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 +; CISI-NEXT: s_mov_b32 s4, s6 +; CISI-NEXT: s_mov_b32 s5, s7 +; CISI-NEXT: s_mov_b32 s6, s2 +; CISI-NEXT: s_mov_b32 s7, s3 ; CISI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CISI-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; CISI-NEXT: buffer_store_byte v1, off, s[0:3], 0 +; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CISI-NEXT: buffer_store_byte v1, off, s[4:7], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: uaddo32_vcc_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -553,12 +549,12 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: uaddo32_vcc_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: global_store_byte v0, v2, s[6:7] @@ -567,11 +563,11 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1010-LABEL: uaddo32_vcc_user: ; GFX1010: ; %bb.0: ; 
GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_add_co_u32 v1, s0, s0, s1 +; GFX1010-NEXT: v_add_co_u32 v1, s0, s2, s3 ; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1010-NEXT: global_store_byte v0, v2, s[6:7] @@ -580,8 +576,8 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W32-LABEL: uaddo32_vcc_user: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_add_co_u32 v1, s4, s4, s5 @@ -593,8 +589,8 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W64-LABEL: uaddo32_vcc_user: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_add_co_u32 v1, s[4:5], s4, s5 @@ -606,8 +602,8 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: uaddo32_vcc_user: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v1, s4, s4, s5 @@ -635,7 +631,7 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; CISI-LABEL: suaddo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s11, 0xf000 ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -659,7 +655,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: suaddo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 @@ -679,7 +675,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: suaddo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s6, s4, s6 @@ -696,7 +692,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1010-LABEL: suaddo64: ; GFX1010: ; %bb.0: -; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_u32 s6, s4, s6 @@ -711,7 +707,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W32-LABEL: suaddo64: ; GFX1030W32: ; %bb.0: -; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s6, s4, 
s6 @@ -726,7 +722,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W64-LABEL: suaddo64: ; GFX1030W64: ; %bb.0: -; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s6, s4, s6 @@ -741,7 +737,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX11-LABEL: suaddo64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s6, s4, s6 ; GFX11-NEXT: s_addc_u32 s7, s5, s7 @@ -772,31 +768,31 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 { ; CISI-LABEL: vuaddo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s11, 0xf000 -; CISI-NEXT: s_mov_b32 s10, -1 -; CISI-NEXT: s_mov_b32 s2, s10 +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CISI-NEXT: s_mov_b32 s3, 0xf000 +; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s8, s4 -; CISI-NEXT: v_mov_b32_e32 v1, s13 -; CISI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 +; CISI-NEXT: s_mov_b32 s0, s4 +; CISI-NEXT: v_mov_b32_e32 v1, s9 +; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 ; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] -; CISI-NEXT: s_mov_b32 s9, s5 -; CISI-NEXT: s_mov_b32 s0, s6 -; CISI-NEXT: s_mov_b32 s1, s7 -; CISI-NEXT: s_mov_b32 s3, s11 -; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; CISI-NEXT: s_mov_b32 s1, s5 +; CISI-NEXT: s_mov_b32 s4, s6 
+; CISI-NEXT: s_mov_b32 s5, s7 +; CISI-NEXT: s_mov_b32 s6, s2 +; CISI-NEXT: s_mov_b32 s7, s3 +; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CISI-NEXT: s_waitcnt expcnt(0) ; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; CISI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: vuaddo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v6, s1 @@ -813,14 +809,14 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: vuaddo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_byte v2, v0, s[6:7] @@ -829,13 +825,13 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-LABEL: vuaddo64: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_add_co_u32 v0, s2, s0, v0 -; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, s1, 0, s2 -; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX1010-NEXT: v_add_co_u32 v0, s0, s2, v0 +; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1010-NEXT: global_store_byte v2, v3, s[6:7] @@ -844,8 +840,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-LABEL: vuaddo64: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_add_co_u32 v0, s6, s4, v0 @@ -859,8 +855,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-LABEL: vuaddo64: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_add_co_u32 v0, s[6:7], s4, v0 @@ -874,15 +870,13 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-LABEL: vuaddo64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s6, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11-NEXT: s_clause 0x1 @@ -909,8 +903,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; CISI-LABEL: ssub64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -925,8 +919,8 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; VI-LABEL: ssub64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_sub_u32 s0, s6, s0 @@ -939,12 +933,12 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX9-LABEL: ssub64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s6, s0 -; GFX9-NEXT: s_subb_u32 s1, s7, s1 +; GFX9-NEXT: s_sub_u32 s0, s6, s2 +; GFX9-NEXT: 
s_subb_u32 s1, s7, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -953,12 +947,12 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1010-LABEL: ssub64rr: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_sub_u32 s0, s6, s0 -; GFX1010-NEXT: s_subb_u32 s1, s7, s1 +; GFX1010-NEXT: s_sub_u32 s0, s6, s2 +; GFX1010-NEXT: s_subb_u32 s1, s7, s3 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -967,8 +961,8 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1030W32-LABEL: ssub64rr: ; GFX1030W32: ; %bb.0: ; %entry ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s0, s6, s0 @@ -981,8 +975,8 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1030W64-LABEL: ssub64rr: ; GFX1030W64: ; %bb.0: ; %entry ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s0, s6, s0 @@ -995,8 +989,8 @@ define 
amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: ssub64rr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s0, s6, s0 ; GFX11-NEXT: s_subb_u32 s1, s7, s1 @@ -1020,7 +1014,7 @@ entry: define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: ssub64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1035,7 +1029,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: ssub64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, 0x56789876, s2 @@ -1048,7 +1042,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: ssub64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1060,7 +1054,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1010-LABEL: ssub64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1072,7 +1066,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W32-LABEL: ssub64ri: ; GFX1030W32: ; %bb.0: 
; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1084,7 +1078,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: ssub64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1096,7 +1090,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: ssub64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, 0x56789876, s2 ; GFX11-NEXT: s_subb_u32 s3, 0x1234, s3 @@ -1120,7 +1114,7 @@ entry: define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: vsub64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1134,7 +1128,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: vsub64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_sub_u32_e32 v3, vcc, s2, v0 @@ -1146,7 +1140,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: vsub64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1157,7 +1151,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1010-LABEL: vsub64rr: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: v_sub_co_u32 v0, s2, s2, v0 @@ -1167,7 +1161,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W32-LABEL: vsub64rr: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, s2, v0 @@ -1177,7 +1171,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: vsub64rr: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s2, v0 @@ -1187,12 +1181,11 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: vsub64rr: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_u32 v0, s2, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1214,7 +1207,7 @@ entry: define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; CISI-LABEL: vsub64ri: ; CISI: ; %bb.0: ; %entry -; 
CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CISI-NEXT: v_sub_i32_e32 v0, vcc, 0x56789876, v0 ; CISI-NEXT: v_mov_b32_e32 v1, 0x1234 ; CISI-NEXT: s_mov_b32 s3, 0xf000 @@ -1226,7 +1219,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; VI-LABEL: vsub64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x56789876, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0x1234 ; VI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -1238,7 +1231,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX9-LABEL: vsub64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0x56789876, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1249,8 +1242,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX1010-LABEL: vsub64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1010-NEXT: s_mov_b32 null, 0 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1010-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, 0x1234, 0, s2 @@ -1260,7 +1252,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX1030W32-LABEL: vsub64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2 @@ -1270,7 +1262,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX1030W64-LABEL: vsub64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[2:3], 0x56789876, v0 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3] @@ -1280,11 +1272,9 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX11-LABEL: vsub64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -1307,8 +1297,8 @@ entry: define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: susubo32: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1319,23 +1309,23 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: susubo32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sub_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_sub_i32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: 
s_endpgm ; ; GFX9-LABEL: susubo32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm @@ -1343,11 +1333,11 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-LABEL: susubo32: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_sub_i32 s0, s0, s1 +; GFX1010-NEXT: s_sub_i32 s0, s2, s3 ; GFX1010-NEXT: v_mov_b32_e32 v1, s0 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1010-NEXT: s_endpgm @@ -1355,37 +1345,37 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-LABEL: susubo32: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: s_sub_i32 s0, s0, s1 -; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0 -; GFX1030W32-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1030W32-NEXT: s_sub_i32 s2, s2, s3 +; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2 +; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030W32-NEXT: s_endpgm ; ; GFX1030W64-LABEL: susubo32: ; GFX1030W64: ; %bb.0: ; 
GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: s_sub_i32 s0, s0, s1 -; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX1030W64-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1030W64-NEXT: s_sub_i32 s2, s2, s3 +; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2 +; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030W64-NEXT: s_endpgm ; ; GFX11-LABEL: susubo32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s0, s0, s1 +; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1409,28 +1399,28 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: usubo32_vcc_user: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s11, 0xf000 -; CISI-NEXT: s_mov_b32 s10, -1 -; CISI-NEXT: s_mov_b32 s2, s10 +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CISI-NEXT: s_mov_b32 s3, 0xf000 +; CISI-NEXT: s_mov_b32 s2, -1 ; 
CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s8, s4 -; CISI-NEXT: v_mov_b32_e32 v0, s13 -; CISI-NEXT: s_mov_b32 s9, s5 -; CISI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 -; CISI-NEXT: s_mov_b32 s0, s6 -; CISI-NEXT: s_mov_b32 s1, s7 -; CISI-NEXT: s_mov_b32 s3, s11 +; CISI-NEXT: s_mov_b32 s0, s4 +; CISI-NEXT: v_mov_b32_e32 v0, s9 +; CISI-NEXT: s_mov_b32 s1, s5 +; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; CISI-NEXT: s_mov_b32 s4, s6 +; CISI-NEXT: s_mov_b32 s5, s7 +; CISI-NEXT: s_mov_b32 s6, s2 +; CISI-NEXT: s_mov_b32 s7, s3 ; CISI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CISI-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; CISI-NEXT: buffer_store_byte v1, off, s[0:3], 0 +; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CISI-NEXT: buffer_store_byte v1, off, s[4:7], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: usubo32_vcc_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -1445,12 +1435,12 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: usubo32_vcc_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: global_store_byte v0, v2, s[6:7] @@ -1459,11 +1449,11 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1010-LABEL: 
usubo32_vcc_user: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_sub_co_u32 v1, s0, s0, s1 +; GFX1010-NEXT: v_sub_co_u32 v1, s0, s2, s3 ; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1010-NEXT: global_store_byte v0, v2, s[6:7] @@ -1472,8 +1462,8 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W32-LABEL: usubo32_vcc_user: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_sub_co_u32 v1, s4, s4, s5 @@ -1485,8 +1475,8 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W64-LABEL: usubo32_vcc_user: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_sub_co_u32 v1, s[4:5], s4, s5 @@ -1498,8 +1488,8 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: usubo32_vcc_user: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], 
s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_sub_co_u32 v1, s4, s4, s5 @@ -1527,7 +1517,7 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; CISI-LABEL: susubo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s11, 0xf000 ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1551,7 +1541,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: susubo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, s4, s6 @@ -1571,7 +1561,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: susubo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_u32 s6, s4, s6 @@ -1588,7 +1578,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1010-LABEL: susubo64: ; GFX1010: ; %bb.0: -; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_u32 s6, s4, s6 @@ -1603,7 +1593,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W32-LABEL: susubo64: ; GFX1030W32: ; %bb.0: -; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s6, s4, s6 @@ -1618,7 +1608,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W64-LABEL: susubo64: ; GFX1030W64: ; %bb.0: -; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s6, s4, s6 @@ -1633,7 +1623,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX11-LABEL: susubo64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s6, s4, s6 ; GFX11-NEXT: s_subb_u32 s7, s5, s7 @@ -1664,31 +1654,31 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 { ; CISI-LABEL: vusubo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s11, 0xf000 -; CISI-NEXT: s_mov_b32 s10, -1 -; CISI-NEXT: s_mov_b32 s2, s10 +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CISI-NEXT: s_mov_b32 s3, 0xf000 +; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s8, s4 -; CISI-NEXT: v_mov_b32_e32 v1, s13 -; CISI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 +; CISI-NEXT: s_mov_b32 s0, s4 +; CISI-NEXT: v_mov_b32_e32 v1, s9 +; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; CISI-NEXT: s_mov_b32 s9, s5 -; CISI-NEXT: s_mov_b32 s0, s6 -; CISI-NEXT: s_mov_b32 s1, s7 -; CISI-NEXT: s_mov_b32 s3, s11 -; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], 
v[0:1] +; CISI-NEXT: s_mov_b32 s1, s5 +; CISI-NEXT: s_mov_b32 s4, s6 +; CISI-NEXT: s_mov_b32 s5, s7 +; CISI-NEXT: s_mov_b32 s6, s2 +; CISI-NEXT: s_mov_b32 s7, s3 +; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CISI-NEXT: s_waitcnt expcnt(0) ; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; CISI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: vusubo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v6, s1 @@ -1705,14 +1695,14 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: vusubo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_byte v2, v0, s[6:7] @@ -1721,13 +1711,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-LABEL: vusubo64: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_sub_co_u32 v0, s2, s0, v0 -; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, s1, 0, s2 -; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX1010-NEXT: v_sub_co_u32 v0, s0, s2, v0 +; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1010-NEXT: global_store_byte v2, v3, s[6:7] @@ -1736,8 +1726,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-LABEL: vusubo64: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_sub_co_u32 v0, s6, s4, v0 @@ -1751,8 +1741,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-LABEL: vusubo64: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[6:7], s4, v0 @@ -1766,15 +1756,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-LABEL: vusubo64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 
0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_u32 v0, s6, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11-NEXT: s_clause 0x1 @@ -1804,8 +1792,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; CISI-LABEL: sudiv64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; CISI-NEXT: s_waitcnt lgkmcnt(0) ; CISI-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] ; CISI-NEXT: s_mov_b32 s0, 0 @@ -1955,8 +1943,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; VI-LABEL: sudiv64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] ; VI-NEXT: s_mov_b32 s0, 0 @@ -2112,18 +2100,18 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GFX9-LABEL: sudiv64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b64 s[0:1], s[6:7], s[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] ; 
GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_sub_u32 s0, 0, s8 -; GFX9-NEXT: s_subb_u32 s1, 0, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: s_sub_u32 s0, 0, s2 +; GFX9-NEXT: s_subb_u32 s1, 0, s3 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2196,24 +2184,24 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: s_mul_i32 s0, s7, s0 ; GFX9-NEXT: s_add_u32 s11, s1, s0 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_i32 s0, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s1, s8, s11 +; GFX9-NEXT: s_mul_i32 s0, s2, s10 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s11 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s9, s11 +; GFX9-NEXT: s_mul_i32 s1, s3, s11 ; GFX9-NEXT: s_add_i32 s12, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s8, s11 +; GFX9-NEXT: s_mul_i32 s1, s2, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_sub_i32 s0, s7, s12 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s13, s0, s9 -; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s8, v0 +; GFX9-NEXT: s_subb_u32 s13, s0, s3 +; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s2, v0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s13, s13, 0 -; GFX9-NEXT: s_cmp_ge_u32 s13, s9 +; GFX9-NEXT: s_cmp_ge_u32 s13, s3 ; GFX9-NEXT: s_cselect_b32 s14, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v1 -; GFX9-NEXT: s_cmp_eq_u32 s13, s9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1 +; GFX9-NEXT: s_cmp_eq_u32 s13, s3 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -2231,10 +2219,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, 
i64 %y) { ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: s_subb_u32 s0, s7, s12 -; GFX9-NEXT: s_cmp_ge_u32 s0, s9 +; GFX9-NEXT: s_cmp_ge_u32 s0, s3 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX9-NEXT: s_cmp_eq_u32 s0, s9 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GFX9-NEXT: s_cmp_eq_u32 s0, s3 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2246,27 +2234,27 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_cbranch_execnz .LBB16_3 ; GFX9-NEXT: .LBB16_2: -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_sub_i32 s0, 0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_sub_i32 s0, 0, s2 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s0, s2, s0 -; GFX9-NEXT: s_add_i32 s2, s2, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s2 -; GFX9-NEXT: s_mul_i32 s3, s0, s8 -; GFX9-NEXT: s_sub_i32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_sub_i32 s6, s3, s8 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s3, s0 +; GFX9-NEXT: s_add_i32 s3, s3, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s3 +; GFX9-NEXT: s_mul_i32 s7, s0, s2 +; GFX9-NEXT: s_sub_i32 s6, s6, s7 +; GFX9-NEXT: s_add_i32 s3, s0, 1 +; GFX9-NEXT: s_sub_i32 s7, s6, s2 +; GFX9-NEXT: s_cmp_ge_u32 s6, s2 +; GFX9-NEXT: s_cselect_b32 s0, s3, s0 +; GFX9-NEXT: s_cselect_b32 s6, s7, s6 +; 
GFX9-NEXT: s_add_i32 s3, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s2 +; GFX9-NEXT: s_cselect_b32 s0, s3, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: .LBB16_3: @@ -2280,18 +2268,18 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-LABEL: sudiv64: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1010-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_or_b64 s[2:3], s[6:7], s[8:9] -; GFX1010-NEXT: s_mov_b32 s2, 0 -; GFX1010-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1010-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] +; GFX1010-NEXT: s_mov_b32 s8, 0 +; GFX1010-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX1010-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1010-NEXT: ; %bb.1: -; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX1010-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX1010-NEXT: s_sub_u32 s3, 0, s8 -; GFX1010-NEXT: s_subb_u32 s10, 0, s9 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX1010-NEXT: s_sub_u32 s9, 0, s2 +; GFX1010-NEXT: s_subb_u32 s10, 0, s3 ; GFX1010-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1010-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1010-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2302,11 +2290,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX1010-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1010-NEXT: s_mul_i32 s11, s3, s0 -; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s1 +; GFX1010-NEXT: s_mul_i32 s11, s9, s0 +; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s1 ; GFX1010-NEXT: s_mul_i32 s12, s10, s1 ; GFX1010-NEXT: s_add_i32 s11, s13, s11 -; GFX1010-NEXT: s_mul_i32 s14, s3, s1 +; GFX1010-NEXT: s_mul_i32 s14, s9, s1 ; GFX1010-NEXT: s_add_i32 s11, s11, s12 ; GFX1010-NEXT: s_mul_hi_u32 
s13, s1, s14 ; GFX1010-NEXT: s_mul_hi_u32 s15, s0, s14 @@ -2326,76 +2314,76 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1010-NEXT: s_addc_u32 s0, s0, s11 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1010-NEXT: s_mul_i32 s11, s3, s0 -; GFX1010-NEXT: s_mul_hi_u32 s12, s3, s1 +; GFX1010-NEXT: s_mul_i32 s11, s9, s0 +; GFX1010-NEXT: s_mul_hi_u32 s12, s9, s1 ; GFX1010-NEXT: s_mul_i32 s10, s10, s1 ; GFX1010-NEXT: s_add_i32 s11, s12, s11 -; GFX1010-NEXT: s_mul_i32 s3, s3, s1 +; GFX1010-NEXT: s_mul_i32 s9, s9, s1 ; GFX1010-NEXT: s_add_i32 s11, s11, s10 -; GFX1010-NEXT: s_mul_hi_u32 s12, s0, s3 -; GFX1010-NEXT: s_mul_i32 s13, s0, s3 -; GFX1010-NEXT: s_mul_hi_u32 s3, s1, s3 +; GFX1010-NEXT: s_mul_hi_u32 s12, s0, s9 +; GFX1010-NEXT: s_mul_i32 s13, s0, s9 +; GFX1010-NEXT: s_mul_hi_u32 s9, s1, s9 ; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s11 ; GFX1010-NEXT: s_mul_i32 s1, s1, s11 ; GFX1010-NEXT: s_mul_hi_u32 s10, s0, s11 -; GFX1010-NEXT: s_add_u32 s1, s3, s1 -; GFX1010-NEXT: s_addc_u32 s3, 0, s14 +; GFX1010-NEXT: s_add_u32 s1, s9, s1 +; GFX1010-NEXT: s_addc_u32 s9, 0, s14 ; GFX1010-NEXT: s_add_u32 s1, s1, s13 ; GFX1010-NEXT: s_mul_i32 s11, s0, s11 -; GFX1010-NEXT: s_addc_u32 s1, s3, s12 -; GFX1010-NEXT: s_addc_u32 s3, s10, 0 +; GFX1010-NEXT: s_addc_u32 s1, s9, s12 +; GFX1010-NEXT: s_addc_u32 s9, s10, 0 ; GFX1010-NEXT: s_add_u32 s1, s1, s11 -; GFX1010-NEXT: s_addc_u32 s3, 0, s3 +; GFX1010-NEXT: s_addc_u32 s9, 0, s9 ; GFX1010-NEXT: v_add_co_u32 v0, s1, v0, s1 ; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1010-NEXT: s_addc_u32 s0, s0, s3 +; GFX1010-NEXT: s_addc_u32 s0, s0, s9 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 ; GFX1010-NEXT: s_mul_i32 s10, s6, s0 -; GFX1010-NEXT: s_mul_hi_u32 s3, s6, s0 +; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s0 ; GFX1010-NEXT: s_mul_hi_u32 s11, s7, s0 ; GFX1010-NEXT: s_mul_i32 s0, s7, s0 ; GFX1010-NEXT: s_mul_hi_u32 s12, s6, s1 ; GFX1010-NEXT: s_mul_hi_u32 s13, s7, s1 ; GFX1010-NEXT: 
s_mul_i32 s1, s7, s1 ; GFX1010-NEXT: s_add_u32 s10, s12, s10 -; GFX1010-NEXT: s_addc_u32 s3, 0, s3 +; GFX1010-NEXT: s_addc_u32 s9, 0, s9 ; GFX1010-NEXT: s_add_u32 s1, s10, s1 -; GFX1010-NEXT: s_addc_u32 s1, s3, s13 -; GFX1010-NEXT: s_addc_u32 s3, s11, 0 +; GFX1010-NEXT: s_addc_u32 s1, s9, s13 +; GFX1010-NEXT: s_addc_u32 s9, s11, 0 ; GFX1010-NEXT: s_add_u32 s1, s1, s0 -; GFX1010-NEXT: s_addc_u32 s3, 0, s3 -; GFX1010-NEXT: s_mul_hi_u32 s0, s8, s1 -; GFX1010-NEXT: s_mul_i32 s11, s8, s3 -; GFX1010-NEXT: s_mul_i32 s12, s8, s1 +; GFX1010-NEXT: s_addc_u32 s9, 0, s9 +; GFX1010-NEXT: s_mul_hi_u32 s0, s2, s1 +; GFX1010-NEXT: s_mul_i32 s11, s2, s9 +; GFX1010-NEXT: s_mul_i32 s12, s2, s1 ; GFX1010-NEXT: s_add_i32 s0, s0, s11 ; GFX1010-NEXT: v_sub_co_u32 v0, s11, s6, s12 -; GFX1010-NEXT: s_mul_i32 s10, s9, s1 +; GFX1010-NEXT: s_mul_i32 s10, s3, s1 ; GFX1010-NEXT: s_add_i32 s0, s0, s10 -; GFX1010-NEXT: v_sub_co_u32 v1, s12, v0, s8 +; GFX1010-NEXT: v_sub_co_u32 v1, s12, v0, s2 ; GFX1010-NEXT: s_sub_i32 s10, s7, s0 ; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1010-NEXT: s_subb_u32 s10, s10, s9 +; GFX1010-NEXT: s_subb_u32 s10, s10, s3 ; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 -; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v1 +; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 ; GFX1010-NEXT: s_subb_u32 s10, s10, 0 -; GFX1010-NEXT: s_cmp_ge_u32 s10, s9 +; GFX1010-NEXT: s_cmp_ge_u32 s10, s3 ; GFX1010-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1010-NEXT: s_cmp_eq_u32 s10, s9 +; GFX1010-NEXT: s_cmp_eq_u32 s10, s3 ; GFX1010-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1010-NEXT: s_add_u32 s10, s1, 1 ; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1010-NEXT: s_addc_u32 s12, s3, 0 +; GFX1010-NEXT: s_addc_u32 s12, s9, 0 ; GFX1010-NEXT: s_add_u32 s13, s1, 2 -; GFX1010-NEXT: s_addc_u32 s14, s3, 0 +; GFX1010-NEXT: s_addc_u32 s14, s9, 0 ; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v0 +; GFX1010-NEXT: v_cmp_le_u32_e32 
vcc_lo, s2, v0 ; GFX1010-NEXT: s_subb_u32 s0, s7, s0 ; GFX1010-NEXT: v_mov_b32_e32 v2, s13 -; GFX1010-NEXT: s_cmp_ge_u32 s0, s9 +; GFX1010-NEXT: s_cmp_ge_u32 s0, s3 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX1010-NEXT: s_cselect_b32 s7, -1, 0 -; GFX1010-NEXT: s_cmp_eq_u32 s0, s9 +; GFX1010-NEXT: s_cmp_eq_u32 s0, s3 ; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s14 @@ -2403,13 +2391,13 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo ; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo ; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1010-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo ; GFX1010-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo -; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s2 +; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 ; GFX1010-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX1010-NEXT: .LBB16_2: -; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX1010-NEXT: s_sub_i32 s1, 0, s8 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX1010-NEXT: s_sub_i32 s1, 0, s2 ; GFX1010-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1010-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2418,17 +2406,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_mul_hi_u32 s1, s0, s1 ; GFX1010-NEXT: s_add_i32 s0, s0, s1 ; GFX1010-NEXT: s_mul_hi_u32 s0, s6, s0 -; GFX1010-NEXT: s_mul_i32 s1, s0, s8 -; GFX1010-NEXT: s_add_i32 s2, s0, 1 +; GFX1010-NEXT: s_mul_i32 s1, s0, s2 +; GFX1010-NEXT: s_add_i32 s3, s0, 1 ; GFX1010-NEXT: s_sub_i32 s1, s6, s1 -; GFX1010-NEXT: s_sub_i32 s3, s1, s8 -; GFX1010-NEXT: s_cmp_ge_u32 s1, s8 -; GFX1010-NEXT: s_cselect_b32 s0, s2, s0 -; GFX1010-NEXT: s_cselect_b32 s1, s3, s1 -; GFX1010-NEXT: s_add_i32 s2, s0, 1 -; GFX1010-NEXT: s_cmp_ge_u32 s1, s8 +; GFX1010-NEXT: s_sub_i32 s6, s1, 
s2 +; GFX1010-NEXT: s_cmp_ge_u32 s1, s2 +; GFX1010-NEXT: s_cselect_b32 s0, s3, s0 +; GFX1010-NEXT: s_cselect_b32 s1, s6, s1 +; GFX1010-NEXT: s_add_i32 s3, s0, 1 +; GFX1010-NEXT: s_cmp_ge_u32 s1, s2 ; GFX1010-NEXT: s_mov_b32 s1, 0 -; GFX1010-NEXT: s_cselect_b32 s0, s2, s0 +; GFX1010-NEXT: s_cselect_b32 s0, s3, s0 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: .LBB16_3: @@ -2442,8 +2430,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-LABEL: sudiv64: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] ; GFX1030W32-NEXT: s_mov_b32 s8, 0 @@ -2604,8 +2592,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-LABEL: sudiv64: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] ; GFX1030W64-NEXT: s_mov_b32 s0, 0 @@ -2765,8 +2753,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-LABEL: sudiv64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] ; GFX11-NEXT: s_mov_b32 s8, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll 
b/llvm/test/CodeGen/AMDGPU/cc-update.ll index 8a39a52cd25eab..8e773cad3b3357 100644 --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -27,7 +27,7 @@ entry: define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_stack: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -36,7 +36,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { ; ; GFX900-LABEL: test_kern_stack: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { ; GFX1010-LABEL: test_kern_stack: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 @@ -266,7 +266,7 @@ entry: define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { ; GFX803-LABEL: test_force_fp_kern_stack: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_mov_b32 s33, 0 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 @@ -276,7 +276,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { ; ; GFX900-LABEL: test_force_fp_kern_stack: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_mov_b32 s33, 0 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 @@ -287,7 +287,7 @@ define 
amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { ; GFX1010-LABEL: test_force_fp_kern_stack: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 ; GFX1010-NEXT: s_mov_b32 s33, 0 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 @@ -509,7 +509,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; GFX803-LABEL: test_sgpr_offset_kernel: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -525,7 +525,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; ; GFX900-LABEL: test_sgpr_offset_kernel: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -541,7 +541,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; ; GFX1010-LABEL: test_sgpr_offset_kernel: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: s_mov_b32 s4, 0x20000 ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index b46cdb8ab3ba0a..3c8ea61b0d43b9 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -5,12 +5,12 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: test_loop: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0xa +; GCN-NEXT: s_load_dword s2, s[0:1], 0xa ; GCN-NEXT: s_waitcnt 
lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, -1 +; GCN-NEXT: s_cmp_eq_u32 s2, -1 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_addk_i32 s0, 0x80 ; GCN-NEXT: s_and_b64 vcc, exec, -1 @@ -118,7 +118,7 @@ for.body: define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: loop_const_true: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_addk_i32 s0, 0x80 ; GCN-NEXT: s_and_b64 vcc, exec, -1 @@ -214,7 +214,7 @@ for.body: define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: loop_const_false: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 m0, -1 @@ -303,7 +303,7 @@ for.body: define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: loop_const_undef: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 m0, -1 @@ -393,7 +393,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v0, v0 -; GCN-NEXT: s_load_dword s4, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_bitcmp1_b32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll index b23249570faa7d..1588dde19cfb78 100644 --- 
a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(ptr add ; ; GCN-LABEL: test_sink_small_offset_global_atomic_csub_i32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index 21e2a85ab18d98..da609bfa8edea6 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add ; ; GCN-LABEL: test_sink_small_offset_global_atomic_fadd_f32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll index 237e06def15763..12ef7657b19130 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -119,7 +119,7 @@ ret: ; GCN-LABEL: {{^}}sink_ubfe_i16: ; GCN-NOT: lshr -; VI: s_load_dword [[ARG:s[0-9]+]], s[2:3], 0x2c +; VI: s_load_dword [[ARG:s[0-9]+]], s[0:1], 0x2c ; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004 ; GCN: s_cbranch_scc{{[0-1]}} diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index ea10547da6ab7f..397efb126053fc 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -425,9 +425,9 @@ bb: 
define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) #0 { ; GFX900-LABEL: vload2_private: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 s0, s0, s9 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] @@ -456,10 +456,10 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; FLATSCR-LABEL: vload2_private: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; FLATSCR-NEXT: s_mov_b32 s4, 0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] @@ -483,9 +483,9 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; GFX10_DEFAULT-LABEL: vload2_private: ; GFX10_DEFAULT: ; %bb.0: ; %entry -; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v2, 0 -; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s15 +; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s9 ; GFX10_DEFAULT-NEXT: s_addc_u32 s1, s1, 0 ; GFX10_DEFAULT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] @@ -514,11 +514,11 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; FLATSCR_GFX10-LABEL: vload2_private: ; FLATSCR_GFX10: ; %bb.0: ; %entry -; FLATSCR_GFX10-NEXT: s_add_u32 s6, s6, s11 -; FLATSCR_GFX10-NEXT: s_addc_u32 s7, s7, 0 -; 
FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; FLATSCR_GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; FLATSCR_GFX10-NEXT: s_add_u32 s2, s2, s5 +; FLATSCR_GFX10-NEXT: s_addc_u32 s3, s3, 0 +; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; FLATSCR_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR_GFX10-NEXT: s_mov_b32 s4, 0 ; FLATSCR_GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -545,7 +545,7 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; GFX11-LABEL: vload2_private: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index e1717a816de0d2..84bd9b6f6c5d48 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_add_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -40,7 +40,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) 
%out, ptr addrsp ; ; GFX9-LABEL: v_clamp_add_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -51,9 +51,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_clamp_add_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -77,7 +75,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_multi_use_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -97,7 +95,7 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_multi_use_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -117,7 +115,7 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_multi_use_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -131,14 +129,13 @@ define amdgpu_kernel 
void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_clamp_multi_use_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e64 v2, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc @@ -161,7 +158,7 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_dbg_use_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -177,7 +174,7 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_dbg_use_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -194,7 +191,7 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_dbg_use_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -205,9 +202,7 @@ define amdgpu_kernel void 
@v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_clamp_dbg_use_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -232,7 +227,7 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_neg_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -249,7 +244,7 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_add_neg_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -267,7 +262,7 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_add_neg_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -279,14 +274,13 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_clamp_add_neg_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_floor_f32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -307,7 +301,7 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_non_clamp_max_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -324,7 +318,7 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_non_clamp_max_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -342,7 +336,7 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_non_clamp_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -354,14 +348,13 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_non_clamp_max_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v1, 0, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -380,7 +373,7 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; SI-LABEL: v_clamp_add_src_f32_denormals: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -396,7 +389,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_clamp_add_src_f32_denormals: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -413,7 +406,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_add_src_f32_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -424,9 +417,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_clamp_add_src_f32_denormals: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -450,7 +441,7 @@ 
define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_f16_denorm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -468,7 +459,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_add_src_f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -485,7 +476,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_add_src_f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -496,9 +487,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_clamp_add_src_f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -522,7 +511,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 { ; SI-LABEL: v_clamp_add_src_f16_no_denormals: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -540,7 +529,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_clamp_add_src_f16_no_denormals: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -557,7 +546,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_clamp_add_src_f16_no_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -568,9 +557,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_clamp_add_src_f16_no_denormals: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -594,7 +581,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -611,7 +598,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: v_clamp_add_src_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -629,7 +616,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_add_src_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -641,9 +628,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: v_clamp_add_src_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -668,7 +653,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -684,7 +669,7 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_add_src_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -701,7 +686,7 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_add_src_f64: ; GFX9: 
; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -712,9 +697,7 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_clamp_add_src_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -738,26 +721,26 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspace(1) %aptr, float %a) #0 { ; SI-LABEL: v_clamp_mac_to_mad: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s0, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mad_f32 v3, s0, s0, v2 clamp +; SI-NEXT: v_mad_f32 v3, s8, s8, v2 clamp ; SI-NEXT: v_add_f32_e32 v2, v3, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: v_clamp_mac_to_mad: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, 
s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -775,31 +758,28 @@ define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_mac_to_mad: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_f32 v2, s0, s0, v1 clamp +; GFX9-NEXT: v_mad_f32 v2, s2, s2, v1 clamp ; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_mac_to_mad: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: v_mul_f32_e64 v2, s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e64 v2, v2, v1 clamp -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 @@ -822,7 +802,7 @@ define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; 
SI-LABEL: v_clamp_add_src_v2f16_denorm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -846,7 +826,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -866,7 +846,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -877,9 +857,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -903,7 +881,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 { ; SI-LABEL: v_clamp_add_src_v2f16_no_denormals: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -927,7 +905,7 @@ define amdgpu_kernel void 
@v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_add_src_v2f16_no_denormals: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -947,7 +925,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_add_src_v2f16_no_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -958,9 +936,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_add_src_v2f16_no_denormals: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -984,7 +960,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1016,7 +992,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: 
v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1038,7 +1014,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1050,14 +1026,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1078,7 +1053,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1103,7 +1078,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1124,7 +1099,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1136,14 +1111,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1166,7 +1140,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1191,7 +1165,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1212,7 +1186,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1224,14 +1198,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1254,7 +1227,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1278,7 +1251,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1298,7 +1271,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1310,14 +1283,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1339,7 +1311,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1362,7 +1334,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; 
GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1382,7 +1354,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1394,14 +1366,13 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1423,7 +1394,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_no_clamp_add_packed_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1448,7 +1419,7 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_no_clamp_add_packed_src_f32: ; GFX8: ; %bb.0: 
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1469,7 +1440,7 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_no_clamp_add_packed_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1481,14 +1452,13 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_no_clamp_add_packed_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1511,7 +1481,7 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 @@ -1535,7 +1505,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; GFX8: ; 
%bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1553,7 +1523,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1566,9 +1536,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 9b6c50c10d90dd..947284506a2970 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -24,7 +24,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 
@@ -41,7 +41,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -52,9 +52,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -67,9 +65,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -93,7 +89,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -109,7 +105,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_clamp_neg_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 
v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -126,7 +122,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -137,9 +133,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_clamp_neg_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -152,9 +146,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: v_clamp_neg_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -179,7 +171,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -195,7 +187,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: v_clamp_negabs_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -212,7 +204,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -223,9 +215,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX11-LABEL: v_clamp_negabs_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -238,9 +228,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: v_clamp_negabs_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -267,7 +255,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negzero_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -285,7 +273,7 @@ define amdgpu_kernel void 
@v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_negzero_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -304,7 +292,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_negzero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -317,14 +305,13 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_clamp_negzero_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -333,14 +320,13 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: v_clamp_negzero_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: 
v_add_f32_e32 v1, 0.5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -363,7 +349,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -381,7 +367,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -400,7 +386,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -413,14 +399,13 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -429,14 +414,13 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX12-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -456,7 +440,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_multi_use_max_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -477,7 +461,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_multi_use_max_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -498,7 +482,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_multi_use_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -513,16 +497,14 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_clamp_multi_use_max_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v1, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc @@ -533,16 +515,14 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: v_clamp_multi_use_max_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1 ; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -566,7 +546,7 @@ define amdgpu_kernel void 
@v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -583,7 +563,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -600,7 +580,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -611,9 +591,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -626,9 +604,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
global_load_u16 v1, v0, s[2:3] @@ -652,7 +628,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -669,7 +645,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_clamp_neg_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -686,7 +662,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -697,9 +673,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_clamp_neg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -712,9 +686,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: v_clamp_neg_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -739,7 +711,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -756,7 +728,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: v_clamp_negabs_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -773,7 +745,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -784,9 +756,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX11-LABEL: v_clamp_negabs_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -799,9 +769,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: v_clamp_negabs_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; 
GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -828,7 +796,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -844,7 +812,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -861,7 +829,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -872,9 +840,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -887,9 +853,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: 
v_clamp_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -913,7 +877,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -929,7 +893,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_clamp_neg_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -946,7 +910,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -957,9 +921,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_clamp_neg_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -972,9 
+934,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: v_clamp_neg_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -999,7 +959,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1015,7 +975,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: v_clamp_negabs_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1032,7 +992,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -1043,9 +1003,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX11-LABEL: v_clamp_negabs_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; 
GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -1058,9 +1016,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: v_clamp_negabs_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -1087,7 +1043,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1104,7 +1060,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX8-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1122,7 +1078,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1134,9 +1090,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1149,9 +1103,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX12-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1173,7 +1125,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_aby_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1189,7 +1141,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_aby_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1206,7 +1158,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_aby_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1217,9 +1169,7 
@@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_aby_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1232,9 +1182,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_aby_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1256,7 +1204,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_bay_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1272,7 +1220,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_bay_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1289,7 +1237,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_bay_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1300,9 +1248,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_bay_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1315,9 +1261,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_bay_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1339,7 +1283,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_yab_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1355,7 +1299,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_yab_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1372,7 +1316,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: 
v_clamp_med3_yab_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1383,9 +1327,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_yab_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1398,9 +1340,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_yab_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1422,7 +1362,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_yba_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1438,7 +1378,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_yba_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: 
v_mov_b32_e32 v1, s3 @@ -1455,7 +1395,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_yba_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1466,9 +1406,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_yba_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1481,9 +1419,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_yba_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1505,7 +1441,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_ayb_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1521,7 +1457,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_ayb_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1538,7 +1474,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_ayb_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1549,9 +1485,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_ayb_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1564,9 +1498,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_ayb_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1588,7 +1520,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_bya_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1604,7 +1536,7 @@ define amdgpu_kernel void 
@v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_bya_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1621,7 +1553,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_bya_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1632,9 +1564,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_bya_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1647,9 +1577,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_bya_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1671,7 +1599,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constants_to_one_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 
s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1683,7 +1611,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX8-LABEL: v_clamp_constants_to_one_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1695,7 +1623,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX9-LABEL: v_clamp_constants_to_one_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1704,10 +1632,8 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX11-LABEL: v_clamp_constants_to_one_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1716,10 +1642,8 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX12-LABEL: v_clamp_constants_to_one_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 
; GFX12-NEXT: s_nop 0 @@ -1735,7 +1659,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constants_to_zero_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1746,7 +1670,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_clamp_constants_to_zero_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1758,7 +1682,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_clamp_constants_to_zero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1767,10 +1691,8 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_clamp_constants_to_zero_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1779,10 +1701,8 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_clamp_constants_to_zero_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1798,7 +1718,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_preserve_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1810,7 +1730,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_clamp_constant_preserve_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0.5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1822,7 +1742,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_clamp_constant_preserve_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1831,10 +1751,8 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_clamp_constant_preserve_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1843,10 +1761,8 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_clamp_constant_preserve_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1862,7 +1778,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1874,7 +1790,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1886,7 +1802,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 
0x7fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1895,9 +1811,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX11-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1907,9 +1821,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX12-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1926,7 +1838,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_qnan_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1937,7 +1849,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX8-LABEL: v_clamp_constant_qnan_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1949,7 +1861,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; 
GFX9-LABEL: v_clamp_constant_qnan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1958,10 +1870,8 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX11-LABEL: v_clamp_constant_qnan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1970,10 +1880,8 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX12-LABEL: v_clamp_constant_qnan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1989,7 +1897,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_snan_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2000,7 +1908,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; 
GFX8-LABEL: v_clamp_constant_snan_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2012,7 +1920,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX9-LABEL: v_clamp_constant_snan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2021,10 +1929,8 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX11-LABEL: v_clamp_constant_snan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2033,10 +1939,8 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX12-LABEL: v_clamp_constant_snan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2056,7 +1960,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr 
addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2073,7 +1977,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2091,7 +1995,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2103,14 +2007,13 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2119,9 +2022,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; 
GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2146,7 +2047,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 { ; GFX6-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2162,7 +2063,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2179,7 +2080,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2190,9 +2091,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2205,9 +2104,7 @@ define amdgpu_kernel 
void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2232,7 +2129,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2249,7 +2146,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2267,7 +2164,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2279,14 +2176,13 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2295,9 +2191,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2321,7 +2215,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2338,7 +2232,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2356,7 +2250,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2368,14 +2262,13 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2384,9 +2277,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2411,7 +2302,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2427,7 +2318,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; 
GFX8-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2444,7 +2335,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2455,9 +2346,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2470,9 +2359,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2494,7 +2381,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2510,7 +2397,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2527,7 +2414,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2538,9 +2425,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2553,9 +2438,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2577,7 +2460,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void 
@v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2593,7 +2476,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2610,7 +2493,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2621,9 +2504,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2636,9 +2517,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2660,7 +2539,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2676,7 +2555,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2693,7 +2572,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2704,9 +2583,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2719,9 +2596,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: 
v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2743,7 +2618,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2759,7 +2634,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2776,7 +2651,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2787,9 +2662,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2802,9 +2675,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2826,7 +2697,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2842,7 +2713,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2859,7 +2730,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2870,9 +2741,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: 
v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2885,9 +2754,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2909,7 +2776,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 { ; GFX6-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2921,7 +2788,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX8-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2933,7 +2800,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX9-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2942,9 +2809,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX11-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2954,10 +2819,8 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX12-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2973,7 +2836,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 { ; GFX6-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2985,7 +2848,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX8-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800001 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2997,7 +2860,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX9-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3006,9 +2869,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX11-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3018,10 +2879,8 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX12-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3037,7 +2896,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3059,7 +2918,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3078,7 +2937,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3089,9 +2948,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_clamp_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3104,9 +2961,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: v_clamp_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3130,7 +2985,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_undef_elt: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3156,7 +3011,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_v2f16_undef_elt: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3180,7 +3035,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_v2f16_undef_elt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3191,9 +3046,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_clamp_v2f16_undef_elt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3206,9 +3059,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX12-LABEL: v_clamp_v2f16_undef_elt: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3232,7 +3083,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad define 
amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_not_zero: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3256,7 +3107,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: v_clamp_v2f16_not_zero: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3277,7 +3128,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_clamp_v2f16_not_zero: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3290,16 +3141,14 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_clamp_v2f16_not_zero: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3308,16 
+3157,14 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX12-LABEL: v_clamp_v2f16_not_zero: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 2.0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3337,7 +3184,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_not_one: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3360,7 +3207,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: v_clamp_v2f16_not_one: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3381,7 +3228,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_v2f16_not_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3394,16 +3241,14 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: v_clamp_v2f16_not_one: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3412,16 +3257,14 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX12-LABEL: v_clamp_v2f16_not_one: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3441,7 +3284,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void 
@v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3464,7 +3307,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: v_clamp_neg_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3483,7 +3326,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_clamp_neg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3494,9 +3337,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: v_clamp_neg_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3509,9 +3350,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: v_clamp_neg_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3536,7 
+3375,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3559,7 +3398,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_negabs_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3578,7 +3417,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_negabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3590,14 +3429,13 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_negabs_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3606,14 +3444,13 @@ define 
amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_negabs_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3636,7 +3473,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neglo_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3659,7 +3496,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_neglo_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3678,7 +3515,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_neglo_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3689,9 +3526,7 @@ define amdgpu_kernel void 
@v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_clamp_neglo_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3704,9 +3539,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: v_clamp_neglo_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3733,7 +3566,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neghi_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3755,7 +3588,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_neghi_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3774,7 +3607,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_neghi_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3785,9 +3618,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_clamp_neghi_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3800,9 +3631,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: v_clamp_neghi_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3829,7 +3658,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_shuffle: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3851,7 +3680,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: v_clamp_v2f16_shuffle: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3870,7 +3699,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_v2f16_shuffle: ; GFX9: ; %bb.0: 
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3881,9 +3710,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: v_clamp_v2f16_shuffle: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3896,9 +3723,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX12-LABEL: v_clamp_v2f16_shuffle: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3923,7 +3748,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3949,7 +3774,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) @@ -3973,7 +3798,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3984,9 +3809,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3999,9 +3822,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -4025,7 +3846,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4051,7 +3872,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts1: ; 
GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4075,7 +3896,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -4086,9 +3907,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -4101,9 +3920,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -4127,7 +3944,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 ; GFX6-LABEL: v_clamp_diff_source_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0x0 ; GFX6-NEXT: s_load_dword s2, s[2:3], 0x2 @@ -4144,7 +3961,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_diff_source_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -4163,7 +3980,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_diff_source_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 @@ -4179,7 +3996,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_clamp_diff_source_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -4197,7 +4014,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX12-LABEL: v_clamp_diff_source_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index fad1d47f55fd79..b6948dab6bf9f2 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -30,7 +30,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -49,7 +49,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX10-LABEL: cluster_load_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s4, s0, 8 ; GFX10-NEXT: s_addc_u32 s5, s1, 0 @@ -96,7 +96,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX11-LABEL: cluster_load_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 @@ -155,7 +155,7 @@ bb: define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_valu_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -175,7 +175,7 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX10-LABEL: cluster_load_valu_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s4, s0, 8 ; GFX10-NEXT: s_addc_u32 s5, s1, 0 @@ -223,7 +223,7 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX11-LABEL: cluster_load_valu_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll index dcd088e2bd9886..9edf5663359254 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll @@ -9,7 +9,7 @@ ; GCN-NEXT: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}} ; GCN-NEXT: global_store_dwordx2 v{{[0-9]+}}, v[[[LO]]:{{[0-9]+\]}}, s[{{[0-9:]+}}] -define amdgpu_kernel void @test_odd_int4(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { +define amdgpu_kernel void @test_odd_int4(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i32 %lid @@ -24,7 +24,7 @@ bb: ; GCN: global_load_dwordx2 v[{{[0-9]*[02468]}}:{{[0-9]+}}], ; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}} ; GCN: global_store_dwordx4 v[{{[0-9]*[02468]:[0-9]*[13579]}}], v[{{[0-9]*[02468]:[0-9]*[13579]}}] -define amdgpu_kernel void @test_vector_creation() #0 { +define amdgpu_kernel void @test_vector_creation() { entry: %tmp231 = load <4 x i16>, ptr addrspace(1) undef, align 2 %vext466 = shufflevector <4 x i16> %tmp231, <4 x i16> undef, <8 x i32> @@ -35,5 +35,3 @@ entry: } declare i32 @llvm.amdgcn.workitem.id.x() - -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll index 3035a8579c8a6d..9321bc262c4a49 100644 --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -12,7 +12,7 @@ ; OSABI-AMDHSA-ASM: .section .rodata,"a" ; 
OSABI-AMDHSA-ASM: .p2align 6 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fadd -; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10 +; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 6 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 @@ -31,7 +31,7 @@ ; OSABI-AMDHSA-ASM: .section .rodata,"a" ; OSABI-AMDHSA-ASM: .p2align 6 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fsub -; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10 +; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 6 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 diff --git a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll index 9d93609b1e8813..aa1ad16b2a56e1 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll @@ -1,9 +1,30 @@ ; REQUIRES: asserts -; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s -; RUN: not --crash llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV4 %s +; RUN: not llc --crash -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV5,COV56 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV6,COV56 %s -; CHECK: function must have been generated already +; AMDGPUAttributor deletes the function "by accident" so it's never +; codegened with optimizations. 
+; OPT: .text +; OPT-NEXT: .section ".note.GNU-stack" +; OPT-NEXT: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" +; COV4-NEXT: .amdhsa_code_object_version 4 +; COV5-NEXT: .amdhsa_code_object_version 5 +; COV6-NEXT: .amdhsa_code_object_version 6 +; OPT-NEXT: .amdgpu_metadata +; OPT-NEXT: --- +; OPT-NEXT: amdhsa.kernels: [] +; OPT-NEXT: amdhsa.target: amdgcn-amd-amdhsa--gfx900 +; OPT-NEXT: amdhsa.version: +; OPT-NEXT: - 1 +; COV4: - 1 +; COV56: - 2 +; OPT: ... define internal i32 @func() { ret i32 0 } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 75f5eda608e80a..6bc8d29b3bf7c2 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -13,7 +13,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -180,7 +180,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_4 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -380,7 +380,7 @@ bb.outer.end: ; preds = %bb.inner.then, %bb define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: nested_if_if_else: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -614,7 +614,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: nested_if_else_if: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -911,10 +911,10 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-LABEL: s_endpgm_unsafe_barrier: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %bb.then -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -922,7 +922,7 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v1, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: .LBB4_2: ; %bb.end -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_barrier ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index df223b3ec1354d..33c0d90f94a397 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: add1: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: add1: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -88,7 +88,7 @@ bb: define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: sub1: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: sub1: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -127,33 +127,33 @@ bb: define amdgpu_kernel void @add_adde(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: add_adde: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v5, v4, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: 
buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: add_adde: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v3, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -171,33 +171,33 @@ bb: define amdgpu_kernel void @adde_add(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: adde_add: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: adde_add: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -215,33 +215,33 @@ bb: define amdgpu_kernel void @sub_sube(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_sube: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_sube: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: 
global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -259,35 +259,35 @@ bb: define amdgpu_kernel void @sub_sube_commuted(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_sube_commuted: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v4, vcc -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_sube_commuted: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; 
GFX9-NEXT: v_add_u32_e32 v0, 0x64, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -306,33 +306,33 @@ bb: define amdgpu_kernel void @sube_sub(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sube_sub: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sube_sub: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -350,33 +350,33 @@ bb: define 
amdgpu_kernel void @zext_flclass(ptr addrspace(1) nocapture %arg, float %x) { ; GCN-LABEL: zext_flclass: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: v_mov_b32_e32 v3, 0x260 -; GCN-NEXT: v_cmp_class_f32_e32 vcc, s4, v3 +; GCN-NEXT: v_cmp_class_f32_e32 vcc, s0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: zext_flclass: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -392,33 +392,33 @@ bb: define amdgpu_kernel void @sext_flclass(ptr addrspace(1) nocapture %arg, float %x) { ; GCN-LABEL: sext_flclass: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 
0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: v_mov_b32_e32 v3, 0x260 -; GCN-NEXT: v_cmp_class_f32_e32 vcc, s4, v3 +; GCN-NEXT: v_cmp_class_f32_e32 vcc, s0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sext_flclass: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -434,7 +434,7 @@ bb: define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: add_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -450,7 +450,7 @@ define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: add_and: ; 
GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_max_u32_e32 v1, 1, v1 ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 @@ -478,7 +478,7 @@ bb: define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: cmp_sub_sext: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -493,7 +493,7 @@ define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: cmp_sub_sext: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -518,7 +518,7 @@ bb: define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: cmp_sub_zext: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -533,7 +533,7 @@ define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: cmp_sub_zext: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -557,33 +557,33 @@ bb: define amdgpu_kernel void @sub_addcarry(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_addcarry: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_addcarry: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -601,33 +601,33 @@ bb: define amdgpu_kernel void @sub_subcarry(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_subcarry: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_subcarry: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -646,7 +646,7 @@ bb: define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_zext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -665,7 +665,7 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_zext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -695,7 
+695,7 @@ bb: define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_sext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -714,7 +714,7 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_sext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll index 5fbcd0bf669995..3a7100c5903ebb 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -5,12 +5,12 @@ define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 { ; CHECK-LABEL: _Z11test_kernelPii: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x2 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 3 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %if.then -; CHECK-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; CHECK-NEXT: s_and_b32 s4, s0, 0xffff ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: s_mul_i32 s6, s4, 0xaaab diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll index 48bd8f9b80799b..c27e44609c527f 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void 
@vectorLoadCombine(ptr %in, ptr %out) { ; GCN-LABEL: vectorLoadCombine: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -37,7 +37,7 @@ entry: define amdgpu_kernel void @vectorLoadShuffle(ptr %in, ptr %out) { ; GCN-LABEL: vectorLoadShuffle: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll index 9e5dbe91504a0c..e9dbce9026ca04 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -26,7 +26,7 @@ define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_copy_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -48,40 +48,41 @@ define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; 
SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 @@ -103,7 +104,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspa define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) 
nounwind { ; SI-LABEL: test_copy_v4i8_x3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -128,7 +129,7 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa ; ; VI-LABEL: test_copy_v4i8_x3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 @@ -162,51 +163,51 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s18, s6 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s19, s7 -; SI-NEXT: s_mov_b32 s22, s6 -; SI-NEXT: s_mov_b32 s23, s7 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 -; SI-NEXT: 
s_mov_b32 s16, s12 -; SI-NEXT: s_mov_b32 s17, s13 -; SI-NEXT: s_mov_b32 s20, s14 -; SI-NEXT: s_mov_b32 s21, s15 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s16, s8 +; SI-NEXT: s_mov_b32 s17, s9 +; SI-NEXT: s_mov_b32 s20, s10 +; SI-NEXT: s_mov_b32 s21, s11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 @@ -240,22 +241,23 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: 
s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 @@ -271,23 +273,23 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_extra_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 
s4, s0 @@ -324,7 +326,7 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -363,7 +365,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p ; ; VI-LABEL: test_copy_v4i8_x2_extra_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 @@ -411,7 +413,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v3i8_align4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -431,7 +433,7 @@ define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: test_copy_v3i8_align4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -455,7 +457,7 @@ define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v3i8_align2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -475,7 +477,7 @@ define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: test_copy_v3i8_align2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -500,7 +502,7 @@ define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v3i8_align1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -523,7 +525,7 @@ define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: test_copy_v3i8_align1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -550,7 +552,7 @@ define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_volatile_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -567,7 +569,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, p ; ; VI-LABEL: test_copy_v4i8_volatile_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: 
s_mov_b32 s10, s6 @@ -589,7 +591,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, p define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_volatile_store: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -616,7 +618,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out, ; ; VI-LABEL: test_copy_v4i8_volatile_store: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll index 95d28c9749522d..a0e76f9a47a8a4 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll @@ -6,44 +6,44 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %e, ptr addrspace(1) %f, ptr addrspace(1) %pout.coerce) { ; RRLIST-LABEL: sccClobber: ; RRLIST: ; %bb.0: ; %entry -; RRLIST-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; RRLIST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; RRLIST-NEXT: v_mov_b32_e32 v2, 0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) ; RRLIST-NEXT: s_load_dword s16, s[8:9], 0x0 -; RRLIST-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; RRLIST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; RRLIST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; RRLIST-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x44 +; RRLIST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44 ; RRLIST-NEXT: s_load_dword s17, s[10:11], 0x0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) ; RRLIST-NEXT: s_min_i32 s4, s16, 0 -; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] +; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; RRLIST-NEXT: s_and_b64 s[2:3], vcc, exec -; RRLIST-NEXT: s_cselect_b32 s2, s16, s17 -; RRLIST-NEXT: s_cmp_eq_u64 s[12:13], s[0:1] -; RRLIST-NEXT: s_cselect_b32 s0, s4, s2 +; RRLIST-NEXT: s_and_b64 s[0:1], vcc, exec +; RRLIST-NEXT: s_cselect_b32 s0, s16, s17 +; RRLIST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3] +; RRLIST-NEXT: s_cselect_b32 s0, s4, s0 ; RRLIST-NEXT: v_mov_b32_e32 v0, s0 ; RRLIST-NEXT: global_store_dword v2, v0, s[14:15] ; RRLIST-NEXT: s_endpgm ; ; FAST-LABEL: sccClobber: ; FAST: ; %bb.0: ; %entry -; FAST-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; FAST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; FAST-NEXT: v_mov_b32_e32 v2, 0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) ; FAST-NEXT: s_load_dword s16, s[8:9], 0x0 -; FAST-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; FAST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; FAST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; FAST-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x44 +; FAST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44 ; FAST-NEXT: s_load_dword s17, s[10:11], 0x0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) ; FAST-NEXT: s_min_i32 s4, s16, 0 -; FAST-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; FAST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; FAST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; FAST-NEXT: s_and_b64 s[2:3], vcc, exec -; FAST-NEXT: s_cselect_b32 s2, s16, s17 -; FAST-NEXT: s_cmp_eq_u64 s[12:13], s[0:1] -; FAST-NEXT: s_cselect_b32 s0, s4, s2 +; FAST-NEXT: s_and_b64 s[0:1], vcc, exec +; FAST-NEXT: s_cselect_b32 s0, s16, s17 +; FAST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3] +; FAST-NEXT: s_cselect_b32 s0, s4, s0 ; FAST-NEXT: v_mov_b32_e32 v0, s0 ; FAST-NEXT: global_store_dword v2, v0, s[14:15] ; FAST-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll index c57ee9cc6a1e2d..7dd95a02f136b7 100644 --- 
a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll @@ -4,12 +4,12 @@ define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(4) %addrSrc) { ; GCN-LABEL: copy_to_scc: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:252 ; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 63b9d68123fa41..4decf39d040134 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -23,11 +23,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s2, s4 +; SI-NEXT: s_flbit_i32_b32 s2, s2 ; SI-NEXT: s_min_u32 s4, s2, 32 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -36,8 +36,8 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; ; VI-LABEL: s_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) @@ -62,36 +62,36 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-LABEL: s_ctlz_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b32 s2, s4 -; GFX10-NEXT: s_min_u32 s2, s2, 32 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_flbit_i32_b32 s0, s4 +; GFX10-NEXT: s_min_u32 s0, s0, 32 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_ctlz_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b32 s2, s4 -; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_flbit_i32_b32 s0, s4 +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: s_ctlz_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u32 s2, s4 +; GFX11-NEXT: s_clz_i32_u32 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_u32 s2, s2, 32 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -107,7 +107,7 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -127,7 +127,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -164,7 +164,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -177,7 +177,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_ctlz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -190,14 +190,13 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-LABEL: v_ctlz_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -214,7 +213,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -236,7 +235,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -278,7 +277,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -293,7 +292,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_ctlz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; 
GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -308,11 +307,9 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-LABEL: v_ctlz_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -336,7 +333,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -362,7 +359,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -414,7 +411,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -433,7 +430,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_ctlz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -452,11 +449,9 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-LABEL: v_ctlz_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -485,7 +480,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -505,7 +500,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_ctlz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -555,7 +550,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_ctlz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -568,7 +563,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr 
addrspac ; ; GFX10-GISEL-LABEL: v_ctlz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -581,7 +576,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX11-LABEL: v_ctlz_i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] @@ -603,8 +598,8 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -617,8 +612,8 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -650,11 +645,11 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-LABEL: s_ctlz_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; 
GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b64 s0, s[0:1] +; GFX10-NEXT: s_flbit_i32_b64 s0, s[2:3] ; GFX10-NEXT: s_min_u32 s0, s0, 64 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] @@ -663,12 +658,12 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-LABEL: s_ctlz_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[0:1] -; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -678,14 +673,14 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX11-LABEL: s_ctlz_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u64 s0, s[0:1] +; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_u32 s0, s0, 64 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: global_store_b64 v1, v[0:1], s[2:3] +; GFX11-NEXT: s_min_u32 s2, s2, 64 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -697,7 +692,7 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -711,7 +706,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -742,7 +737,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -753,7 +748,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -764,7 +759,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX11-LABEL: s_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) @@ -783,7 +778,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -804,7 +799,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -852,7 +847,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -867,7 +862,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_ctlz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -883,9 +878,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-LABEL: v_ctlz_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -912,7 +905,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -933,7 +926,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -981,7 +974,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -996,7 +989,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1012,20 +1005,18 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX11-LABEL: v_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 
v0, 3, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp -; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 -; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp +; GFX11-NEXT: v_min3_u32 v1, v1, v2, 64 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1042,7 +1033,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1061,7 +1052,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1099,7 +1090,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1111,7 +1102,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1126,10 +1117,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1151,7 +1140,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1170,7 +1159,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1208,7 
+1197,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1220,7 +1209,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1235,10 +1224,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1261,7 +1248,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1283,7 +1270,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: 
v_ctlz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1326,7 +1313,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1341,7 +1328,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1356,16 +1343,14 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; 
GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1385,7 +1370,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1407,7 +1392,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1450,7 +1435,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1465,7 +1450,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1480,16 +1465,14 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1509,7 +1492,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1527,7 +1510,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1569,7 +1552,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -1580,7 +1563,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; 
GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1600,8 +1583,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1623,7 +1606,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1641,7 +1624,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1691,7 +1674,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1706,7 +1689,7 @@ define 
amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1722,7 +1705,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1750,7 +1733,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1769,7 +1752,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1812,7 +1795,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -1824,7 +1807,7 @@ define 
amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1846,14 +1829,13 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7f, v0 ; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index f16f05811c185a..d269eb680138bb 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -29,11 +29,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s4 +; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 @@ -41,10 +41,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s4 +; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -64,13 +64,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, ptr addrspace(1) %out, align 4 @@ -80,7 +80,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -99,7 +99,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) 
noalias %out, ; ; VI-LABEL: v_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -134,7 +134,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -154,7 +154,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -174,7 +174,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctlz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -211,7 +211,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -232,7 +232,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) 
noalias %out define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -254,7 +254,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctlz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -295,7 +295,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -318,11 +318,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s4, 24 +; SI-NEXT: s_lshl_b32 s2, s2, 24 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -331,10 +331,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: s_ctlz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 24 +; VI-NEXT: s_lshl_b32 s2, s2, 24 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -373,14 +373,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 24 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 24 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i8 %val, 0 @@ -392,11 +392,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s4, 16 +; SI-NEXT: s_lshl_b32 s2, s2, 16 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: 
v_mov_b32_e32 v0, s4 @@ -405,10 +405,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 16 +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -447,14 +447,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 16 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 16 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i16 %val, 0 @@ -466,11 +466,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s4 +; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -478,10 +478,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s4 +; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -501,13 +501,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i32 %val, 0 @@ -519,7 +519,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -533,7 +533,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -561,7 +561,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -580,7 +580,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -601,7 +601,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -649,7 +649,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -672,7 +672,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -697,7 +697,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -753,7 +753,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -778,7 +778,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -809,7 +809,7 @@ define amdgpu_kernel 
void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -869,7 +869,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -899,7 +899,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -946,7 +946,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1050,7 +1050,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: 
global_load_ubyte v0, v1, s[2:3] @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1113,7 +1113,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: v_ctlz_zero_undef_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1158,7 +1158,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1183,8 +1183,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,14 +1196,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b64 s0, s[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1225,14 +1225,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) @@ -1243,7 +1243,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -1256,7 +1256,7 @@ define 
amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: s_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -1298,7 +1298,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1318,7 +1318,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,7 +1364,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -1388,7 
+1388,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -1408,7 +1408,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: v_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1454,7 +1454,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1498,7 +1498,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1534,7 +1534,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1558,7 +1558,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1577,7 +1577,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1613,7 +1613,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1637,7 +1637,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no define amdgpu_kernel void 
@v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1655,7 +1655,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1697,7 +1697,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1726,7 +1726,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1750,7 +1750,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1799,7 +1799,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1829,7 +1829,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1850,7 +1850,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1888,7 +1888,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1913,7 +1913,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: 
v_ctlz_zero_undef_i32_sel_ne_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1934,7 +1934,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1972,7 +1972,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1997,7 +1997,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2018,7 +2018,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2057,7 +2057,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr 
addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -2082,7 +2082,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2103,7 +2103,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2142,7 +2142,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -2196,11 +2196,11 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) { define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i18: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, 
s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s4, 14 +; SI-NEXT: s_lshl_b32 s2, s2, 14 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -2213,10 +2213,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i18: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 14 +; VI-NEXT: s_lshl_b32 s2, s2, 14 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2270,18 +2270,18 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 14 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 -; GFX9-GISEL-NEXT: s_and_b32 s2, s2, 0x3ffff -; GFX9-GISEL-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] offset:2 +; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 14 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: s_and_b32 s0, s0, 0x3ffff +; GFX9-GISEL-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: global_store_byte 
v0, v1, s[2:3] offset:2 ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) nounwind readnone store i18 %ctlz, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index 40929d58834472..b6359f18169799 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -14,8 +14,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -27,8 +27,8 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) ; ; VI-LABEL: s_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -72,7 +72,7 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -91,7 +91,7 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -142,8 +142,8 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in0, ptr addrspace(1) noalias %in1) nounwind { ; SI-LABEL: v_ctpop_add_chain_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -166,8 +166,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctpop_add_chain_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -239,8 +239,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %sval) nounwind { ; SI-LABEL: v_ctpop_add_sgpr_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -259,8 +259,8 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: v_ctpop_add_sgpr_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, 
v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -320,7 +320,7 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -344,7 +344,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -400,7 +400,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -430,7 +430,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -520,7 +520,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v8i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -562,7 +562,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v16i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -769,7 +769,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: v_ctpop_v16i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal ; ; VI-LABEL: v_ctpop_i16_add_inline_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) ; ; VI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1160,7 +1160,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_literal: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1180,7 +1180,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctpop_i16_add_literal: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_movk_i32 s4, 0x3e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1234,8 +1234,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var: ; SI: ; 
%bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -1254,8 +1254,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, pt ; ; VI-LABEL: v_ctpop_i16_add_var: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1315,8 +1315,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, pt define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctpop_i16_add_var_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1396,8 +1396,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %constptr) nounwind { ; SI-LABEL: 
v_ctpop_i16_add_vvar_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1418,8 +1418,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v_ctpop_i16_add_vvar_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1487,8 +1487,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %ctpop_arg, i16 %cond) { ; SI-LABEL: ctpop_i16_in_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: s_cmp_lg_u32 s5, 0 @@ -1517,8 +1517,8 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: ctpop_i16_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: s_cmp_lg_u32 s5, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 1c16612bed37fc..131ce14a7847c8 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -16,8 +16,8 @@ declare i128 
@llvm.ctpop.i128(i128) nounwind readnone define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctpop_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,8 +28,8 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_ctpop_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -46,7 +46,7 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -92,8 +92,8 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %s.val) nounwind { ; SI-LABEL: v_ctpop_i64_user: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: 
s_load_dwordx2 s[12:13], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -115,8 +115,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_ctpop_i64_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -144,8 +144,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) nounwind { ; SI-LABEL: s_ctpop_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -158,8 +158,8 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64 ; ; VI-LABEL: s_ctpop_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -178,38 +178,38 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64 define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64> %val) nounwind { ; SI-LABEL: s_ctpop_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; 
SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bcnt1_i32_b64 s0, s[4:5] -; SI-NEXT: s_bcnt1_i32_b64 s1, s[6:7] -; SI-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; SI-NEXT: s_bcnt1_i32_b64 s3, s[10:11] -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] +; SI-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; SI-NEXT: s_bcnt1_i32_b64 s7, s[10:11] +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctpop_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s15, 0xf000 -; VI-NEXT: s_mov_b32 s14, -1 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5] -; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7] -; VI-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; VI-NEXT: s_bcnt1_i32_b64 s3, s[10:11] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] +; VI-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; VI-NEXT: s_bcnt1_i32_b64 s7, s[10:11] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: buffer_store_dwordx4 
v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> @@ -220,7 +220,7 @@ define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64 define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -242,7 +242,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -270,7 +270,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -298,7 +298,7 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -334,11 +334,11 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) { ; SI-LABEL: ctpop_i64_in_br: ; SI: ; %bb.0: 
; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd +; SI-NEXT: s_load_dword s8, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 @@ -363,11 +363,11 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: ctpop_i64_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dword s8, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s8, 0 ; VI-NEXT: s_cbranch_scc0 .LBB7_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 @@ -409,8 +409,8 @@ endif: define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val) nounwind { ; SI-LABEL: s_ctpop_i128: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -423,8 +423,8 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val ; ; VI-LABEL: s_ctpop_i128: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -443,8 
+443,8 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) nounwind { ; SI-LABEL: s_ctpop_i65: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -460,8 +460,8 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) ; ; VI-LABEL: s_ctpop_i65: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -484,7 +484,7 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i128: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -507,7 +507,7 @@ define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctpop_i128: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index 02b0b1cc28fa86..ee2894a66fbfcc 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -22,11 +22,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void 
@s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s2, s4 +; SI-NEXT: s_ff1_i32_b32 s2, s2 ; SI-NEXT: s_min_u32 s4, s2, 32 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -35,8 +35,8 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; ; VI-LABEL: s_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -61,27 +61,27 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-LABEL: s_cttz_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b32 s2, s4 -; GFX10-NEXT: s_min_u32 s2, s2, 32 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_ff1_i32_b32 s0, s4 +; GFX10-NEXT: s_min_u32 s0, s0, 32 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_cttz_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; 
GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone store i32 %cttz, ptr addrspace(1) %out, align 4 @@ -91,7 +91,7 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -111,7 +111,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -148,7 +148,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_cttz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -182,7 +182,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -204,7 +204,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -246,7 +246,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -261,7 +261,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_cttz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -284,7 +284,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -310,7 +310,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -362,7 +362,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -381,7 +381,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_cttz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -408,7 +408,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -427,7 +427,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_cttz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, 
-1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -475,7 +475,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_cttz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -487,7 +487,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-GISEL-LABEL: v_cttz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -505,8 +505,8 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_cttz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -519,8 +519,8 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -552,11 +552,11 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-LABEL: s_cttz_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b64 s0, s[0:1] +; GFX10-NEXT: s_ff1_i32_b64 s0, s[2:3] ; GFX10-NEXT: s_min_u32 s0, s0, 64 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] @@ -565,12 +565,12 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-LABEL: s_cttz_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[0:1] -; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3] ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -584,7 +584,7 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -598,7 +598,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -629,7 +629,7 @@ define 
amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -640,7 +640,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-GISEL-LABEL: s_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -657,7 +657,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_cttz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -678,7 +678,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -726,7 +726,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -741,7 +741,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr 
addrspa ; ; GFX10-GISEL-LABEL: v_cttz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -766,7 +766,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -787,7 +787,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -835,7 +835,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -850,7 +850,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-GISEL-LABEL: v_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -876,7 +876,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr 
addrspace(1) noalias %out, ptr a define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -895,7 +895,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -933,7 +933,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -945,7 +945,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -970,7 +970,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 @@ -989,7 +989,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1039,7 +1039,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1065,7 +1065,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1087,7 +1087,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1130,7 +1130,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1145,7 +1145,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1192,7 +1192,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1235,7 +1235,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1250,7 +1250,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1275,7 +1275,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1293,7 +1293,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 
v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1375,7 +1375,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1393,7 +1393,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1442,7 +1442,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1456,7 +1456,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1480,7 +1480,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1499,7 +1499,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1542,7 +1542,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -1554,7 +1554,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 2491abe4bc1cee..392a44318b0a5b 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -16,11 +16,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 
s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -28,10 +28,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -51,13 +51,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, ptr addrspace(1) %out, align 4 @@ -67,7 +67,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -86,7 +86,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -121,7 +121,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -141,7 +141,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -219,7 +219,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -241,7 +241,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -305,11 +305,11 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, 
s[0:3], 0 @@ -317,10 +317,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: s_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -356,13 +356,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i8 %val, 0 @@ -374,11 +374,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 
s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -386,10 +386,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -425,13 +425,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i16 %val, 0 @@ -443,11 +443,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -455,10 +455,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -478,13 +478,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i32 %val, 0 @@ -496,7 +496,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -510,7 +510,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -538,7 +538,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -557,7 +557,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -577,7 +577,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -622,7 +622,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -644,7 +644,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -668,7 +668,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -721,7 +721,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -745,7 +745,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -776,7 +776,7 @@ define amdgpu_kernel void 
@v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -836,7 +836,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -866,7 +866,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -913,7 +913,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1017,7 +1017,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte 
v0, v1, s[2:3] @@ -1061,7 +1061,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1091,7 +1091,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1152,7 +1152,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1213,7 +1213,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1274,7 +1274,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -1305,7 +1305,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1404,7 +1404,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -1435,7 +1435,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; 
SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1453,7 +1453,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1498,7 +1498,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1522,7 +1522,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1544,7 +1544,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1597,7 +1597,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 96969a12b2c589..3f513e120e141b 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -900,7 +900,7 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -918,7 +918,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -933,7 +933,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; GFX10-LABEL: load_i8_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -955,8 +955,8 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; GFX11-LABEL: load_i8_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -976,7 +976,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -996,7 +996,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1013,7 +1013,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v2i8_to_v2f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1039,11 +1039,9 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v2i8_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1064,7 +1062,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) 
noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1086,7 +1084,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1104,7 +1102,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v3i8_to_v3f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1132,10 +1130,8 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v3i8_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1157,7 +1153,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; 
SI-NEXT: s_mov_b32 s11, s7 @@ -1179,7 +1175,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1198,7 +1194,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v4i8_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1228,11 +1224,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v4i8_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1259,7 +1253,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1287,7 +1281,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1318,7 +1312,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1361,10 +1355,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 @@ -1396,7 +1388,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -1434,7 +1426,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 ; VI-NEXT: s_mov_b32 s8, 0x4000405 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1481,7 +1473,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1529,11 +1521,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2 @@ -1573,21 +1563,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; 
SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4 @@ -1596,7 +1586,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 @@ -1609,22 +1599,22 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: v_mov_b32_e32 v7, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1653,12 +1643,12 @@ define amdgpu_kernel void 
@load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[0:1] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 @@ -1714,13 +1704,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_add_nc_u16 v2, v0, 9 @@ -1767,7 +1755,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1806,7 +1794,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1851,7 +1839,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v7i8_to_v7f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1910,11 +1898,9 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v7i8_to_v7f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6 @@ -1953,7 +1939,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1980,7 +1966,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2004,7 +1990,7 @@ define amdgpu_kernel void 
@load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v8i8_to_v8f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2044,11 +2030,9 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v8i8_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v10, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[8:9], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2077,7 +2061,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2097,7 +2081,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2114,7 +2098,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2140,14 +2124,13 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; GFX11-LABEL: i8_zext_inreg_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2166,7 +2149,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2185,7 +2168,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2201,7 +2184,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2225,10 +2208,8 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2252,7 +2233,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -2270,7 +2251,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -2285,7 +2266,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: i8_zext_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -2307,8 +2288,8 @@ define amdgpu_kernel void 
@i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: i8_zext_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2329,7 +2310,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2357,7 +2338,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2388,7 +2369,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2431,10 +2412,8 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; 
GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 @@ -2465,7 +2444,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2484,7 +2463,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2500,7 +2479,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte0_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2524,10 +2503,8 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte0_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt 
vmcnt(0) @@ -2548,7 +2525,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2567,7 +2544,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2583,7 +2560,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2607,10 +2584,8 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2632,7 +2607,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) 
nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2651,7 +2626,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2667,7 +2642,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte2_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2691,10 +2666,8 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte2_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2716,7 +2689,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; 
SI-NEXT: s_mov_b32 s11, s7 @@ -2735,7 +2708,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2751,7 +2724,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte3_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2775,10 +2748,8 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte3_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2800,7 +2771,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -2820,7 +2791,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -2840,7 +2811,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX10-LABEL: cvt_ubyte0_or_multiuse: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2868,17 +2839,15 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX11-LABEL: cvt_ubyte0_or_multiuse: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v0, 0x80000001, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index 6799980c184391..fed4b9862dbfb4 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -8,7 +8,7 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: add: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -30,7 +30,7 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: sub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -52,7 +52,7 @@ define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -74,7 +74,7 @@ define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: or: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -96,7 +96,7 @@ define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: xor: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -118,7 +118,7 @@ define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) %q) 
{ ; CHECK-LABEL: nand: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -154,7 +154,7 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: max_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -176,7 +176,7 @@ define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addr define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: max: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -198,7 +198,7 @@ define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: min_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -220,7 +220,7 @@ define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addr define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: min: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt 
lgkmcnt(0) @@ -242,7 +242,7 @@ define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umax_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -264,7 +264,7 @@ define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr add define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -286,7 +286,7 @@ define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umin_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -308,7 +308,7 @@ define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr add define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -330,7 +330,7 @@ define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: cmpxchg: ; CHECK: ; 
%bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 @@ -354,7 +354,7 @@ define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace( define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: xchg: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -376,7 +376,7 @@ define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: inc: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -398,7 +398,7 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: dec: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -420,7 +420,7 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fadd: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -457,7 +457,7 @@ define protected amdgpu_kernel void @fadd(ptr 
addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fsub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -494,7 +494,7 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fmin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 @@ -519,7 +519,7 @@ define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fmax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 @@ -544,15 +544,15 @@ define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.swap: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_swap v0, v1, s[4:7], 0 offen glc 
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -566,15 +566,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.add: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -588,15 +588,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.sub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: 
buffer_atomic_sub v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -610,15 +610,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.smin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_smin v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -632,15 +632,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.smax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; 
CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_smax v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -654,15 +654,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.umin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_umin v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -676,15 +676,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.umax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt 
lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_umax v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -698,15 +698,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_and v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -720,15 +720,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.or: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: 
v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_or v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -742,15 +742,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) %rsrc define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.xor: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_xor v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -764,15 +764,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.inc: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_inc v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -786,15 +786,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.dec: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_dec v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -808,16 +808,16 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.cmpswap: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; 
CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -831,17 +831,16 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8) define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fadd: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v1, 1.0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v1, v0, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -856,18 +855,17 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fmin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] -; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -882,18 +880,17 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.fmax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fmax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] -; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll 
b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll index 297fe7618672e6..1e5ec361d154c5 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) { ; CHECK-LABEL: eggs: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp0_b32 s0, 0 @@ -33,7 +33,7 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: v_mov_b32_e32 v7, 0 ; CHECK-NEXT: .LBB0_3: ; %bb41 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x48 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x48 ; CHECK-NEXT: v_mov_b32_e32 v8, s10 ; CHECK-NEXT: v_mov_b32_e32 v9, s11 ; CHECK-NEXT: v_mov_b32_e32 v10, s12 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll index 8fa0068a237cd5..f414565f78f11a 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @eq_t(float %x) { ; GCN-LABEL: eq_t: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] @@ -21,7 +21,7 @@ define amdgpu_kernel void @eq_t(float %x) { define amdgpu_kernel void @ne_t(float %x) { ; GCN-LABEL: ne_t: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 ; 
GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] @@ -38,7 +38,7 @@ define amdgpu_kernel void @ne_t(float %x) { define amdgpu_kernel void @eq_f(float %x) { ; GCN-LABEL: eq_f: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] @@ -55,7 +55,7 @@ define amdgpu_kernel void @eq_f(float %x) { define amdgpu_kernel void @ne_f(float %x) { ; GCN-LABEL: ne_f: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index f298a95c63485e..8f31bb1fe0a81c 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -7,11 +7,11 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; GCN-LABEL: uniform_vec_0_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_lshl_b32 s4, s2, 16 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -19,33 +19,33 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; ; GFX9-LABEL: uniform_vec_0_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 
0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_lshl_b32 s0, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_0_i16: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_lshl_b32 s2, s4, 16 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_lshl_b32 s0, s4, 16 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_0_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s2, s4, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -92,11 +92,11 @@ define i32 @divergent_vec_0_i16(i16 %a) { define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; GCN-LABEL: uniform_vec_i16_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 
s4, s2, 0xffff ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -104,33 +104,33 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; ; GFX9-LABEL: uniform_vec_i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_0: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -177,11 +177,11 @@ define i32 @divergent_vec_i16_0(i16 %a) { 
define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; GCN-LABEL: uniform_vec_f16_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 s4, s2, 0xffff ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -189,33 +189,33 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; ; GFX9-LABEL: uniform_vec_f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_f16_0: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_f16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -262,7 +262,7 @@ define float @divergent_vec_f16_0(half %a) { define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) { ; GCN-LABEL: uniform_vec_i16_LL: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -277,7 +277,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_i16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -290,7 +290,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_i16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -303,7 +303,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX11-LABEL: uniform_vec_i16_LL: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 @@ -361,7 +361,7 @@ define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) { define 
amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: uniform_vec_i16_LH: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 @@ -376,7 +376,7 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_LH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s2, s3 @@ -386,7 +386,7 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX906-LABEL: uniform_vec_i16_LH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_pack_lh_b32_b16 s2, s2, s3 @@ -396,7 +396,7 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX11-LABEL: uniform_vec_i16_LH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_pack_lh_b32_b16 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -452,7 +452,7 @@ define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) { define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GCN-LABEL: uniform_vec_i16_HH: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -466,7 +466,7 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_HH: ; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_hh_b32_b16 s2, s2, s3 @@ -476,7 +476,7 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX906-LABEL: uniform_vec_i16_HH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_pack_hh_b32_b16 s2, s2, s3 @@ -486,7 +486,7 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX11-LABEL: uniform_vec_i16_HH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_pack_hh_b32_b16 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -546,7 +546,7 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) { define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) { ; GCN-LABEL: uniform_vec_f16_LL: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -561,7 +561,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_f16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -574,7 +574,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_f16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -587,7 +587,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX11-LABEL: uniform_vec_f16_LL: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 @@ -684,10 +684,10 @@ entry: define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ptr addrspace(1) %out) #0 { ; GCN-LABEL: build_vec_v2i16_undeflo_uniform: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u16 v0, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -698,35 +698,35 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ; ; GFX9-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read_u16_d16 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x24 +; 
GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v0, s4 ; GFX906-NEXT: ds_read_u16 v0, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_store_dword v1, v0, s[0:1] +; GFX906-NEXT: global_store_dword v1, v0, s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: ds_load_u16_d16 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll index d99e9699c27894..8c3155fc5c6ea8 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: uniform_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_i32 s2, s2, s3 @@ -25,7 +25,7 @@ define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: divergent_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 
-1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -48,7 +48,7 @@ define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(ptr addrspace(1) %out define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: uniform_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_i32 s2, s2, s3 @@ -69,7 +69,7 @@ define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, define amdgpu_kernel void @divergent_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: divergent_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll index ae4d302e04a7cd..c3a6cd5975a779 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) { ; GCN-LABEL: name: uniform_trunc_i16_to_i1 ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr2_sgpr3 + ; GCN-NEXT: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from 
%ir.z.kernarg.offset.align.down, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 @@ -55,9 +55,9 @@ define i1 @divergent_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) { define amdgpu_kernel void @uniform_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) { ; GCN-LABEL: name: uniform_trunc_i32_to_i1 ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr2_sgpr3 + ; GCN-NEXT: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s64) from %ir.x.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 @@ -106,9 +106,9 @@ define i1 @divergent_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) { define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) { ; GCN-LABEL: name: uniform_trunc_i64_to_i1 ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr2_sgpr3 + ; GCN-NEXT: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1 diff --git 
a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll index 75d9dd924a4d60..b0e1da3b8eecba 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds1align1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_u8 v0, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds2align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -37,7 +37,7 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds2align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -52,7 +52,7 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds2align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_u16 v0, v0 @@ -68,7 +68,7 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) 
%out) { ; GCN-LABEL: ds2align2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_u16 v0, v0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds4align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -104,7 +104,7 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds4align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -130,7 +130,7 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds4align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b32 v0, v0 @@ -146,7 +146,7 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds4align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 @@ 
-160,7 +160,7 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds4align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -174,7 +174,7 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds4align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b32 v0, v0 @@ -190,7 +190,7 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds4align4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b32 v0, v0 @@ -206,7 +206,7 @@ define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds8align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -234,7 +234,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds8align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds8align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 @@ -291,7 +291,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds8align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4 @@ -311,7 +311,7 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds8align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -331,7 +331,7 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds8align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 @@ -347,7 +347,7 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds8align4: ; GCN: ; %bb.0: -; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 @@ -363,7 +363,7 @@ define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds8align8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b64 v[0:1], v0 @@ -379,7 +379,7 @@ define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds12align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -415,7 +415,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds12align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -473,7 +473,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds12align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 @@ -489,7 +489,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou 
define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds12align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8 @@ -513,7 +513,7 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds12align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -539,7 +539,7 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds12align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 @@ -555,7 +555,7 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-LABEL: ds12align4: ; ALIGNED: ; %bb.0: -; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -569,7 +569,7 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-SDAG-LABEL: ds12align4: ; UNALIGNED-SDAG: ; %bb.0: -; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -583,7 +583,7 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-GISEL-LABEL: ds12align4: ; UNALIGNED-GISEL: ; %bb.0: -; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0 @@ -599,7 +599,7 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds12align8: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v2 @@ -613,7 +613,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds12align8: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -627,7 +627,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-SDAG-LABEL: ds12align8: ; UNALIGNED-SDAG: ; %bb.0: -; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8 @@ -641,7 +641,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-GISEL-LABEL: 
ds12align8: ; UNALIGNED-GISEL: ; %bb.0: -; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0 @@ -657,7 +657,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds12align16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b96 v[0:2], v0 @@ -673,7 +673,7 @@ define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %o define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds16align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -716,7 +716,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds16align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -789,7 +789,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds16align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b128 
v[0:3], v0 @@ -805,7 +805,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds16align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12 @@ -835,7 +835,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds16align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -867,7 +867,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds16align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0 @@ -883,7 +883,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-LABEL: ds16align4: ; ALIGNED: ; %bb.0: -; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -897,7 +897,7 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-SDAG-LABEL: ds16align4: ; UNALIGNED-SDAG: ; %bb.0: -; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3 @@ -911,7 +911,7 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-GISEL-LABEL: ds16align4: ; UNALIGNED-GISEL: ; %bb.0: -; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-GISEL-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 @@ -927,7 +927,7 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds16align8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 @@ -943,7 +943,7 @@ define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align16(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds16align16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b128 v[0:3], v0 diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll index 31bbe6fbbaa143..aa1d44c31606b8 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: ds_read32_combine_stride_400: -; GCN: 
s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -47,7 +47,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_20: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -91,7 +91,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_400_back: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -136,7 +136,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_8192: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:32 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96 @@ -172,7 +172,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] @@ -206,7 +206,7 @@ bb: } ; GCN-LABEL: ds_read64_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -246,7 +246,7 @@ bb: } ; GCN-LABEL: ds_read64_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] @@ -280,7 
+280,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -316,7 +316,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_400_back: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -352,7 +352,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_8192: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96 @@ -379,7 +379,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[BASE:v[0-9]+]], vcc, 4, [[BASE]] @@ -406,7 +406,7 @@ bb: } ; GCN-LABEL: ds_write64_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -437,7 +437,7 @@ bb: } ; GCN-LABEL: ds_write64_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[BASE]], vcc, 8, [[BASE]] diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll index 7d75f1947b51af..5814b8a8ceda45 100644 --- 
a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll @@ -9,7 +9,7 @@ ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 ; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8 ; GCN: s_waitcnt lgkmcnt({{[0-9]+}}) -define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 { +define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace(3) %inptr) { %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 24 %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4 @@ -37,7 +37,7 @@ define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27 ; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 -define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 { +define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3) %inptr) { %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 100 %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4 @@ -67,7 +67,7 @@ define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104 -define amdgpu_kernel void @ds_combine_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 { +define amdgpu_kernel void @ds_combine_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) { %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 24 %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4 @@ -96,7 +96,7 @@ define amdgpu_kernel void @ds_combine_RAW(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108 ; GCN-NEXT: 
ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104 -define amdgpu_kernel void @ds_combine_WAR_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 { +define amdgpu_kernel void @ds_combine_WAR_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) { %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 100 %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4 @@ -115,5 +115,3 @@ define amdgpu_kernel void @ds_combine_WAR_RAW(ptr addrspace(1) %out, ptr addrspa store float %sum, ptr addrspace(1) %out, align 4 ret void } - -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index 41a9d7999e80a3..7b9b130e1cf796 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -36,9 +36,8 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 { ; ; GFX11-LABEL: write_ds_sub0_offset0_global: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:12 ; GFX11-NEXT: s_endpgm @@ -54,7 +53,7 @@ entry: define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #0 { ; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 +; CI-NEXT: s_load_dword 
s0, s[0:1], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-NEXT: s_mov_b64 vcc, 0 @@ -74,7 +73,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; ; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v0 @@ -91,7 +90,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; ; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b @@ -107,11 +106,10 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; ; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -137,7 +135,7 @@ entry: define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy.val) #0 { ; CI-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 +; CI-NEXT: s_load_dword s0, s[0:1], 0x0 ; CI-NEXT: s_mov_b64 vcc, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -156,7 +154,7 @@ define amdgpu_kernel void 
@write_ds_sub_max_offset_global_clamp_bit(float %dummy ; ; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7b ; GFX9-NEXT: v_mov_b32_e32 v4, 0 @@ -172,7 +170,7 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy ; ; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b @@ -187,7 +185,7 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy ; ; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0x7b :: v_dual_mov_b32 v3, 0 @@ -235,9 +233,7 @@ define amdgpu_kernel void @add_x_shl_max_offset() #1 { ; ; GFX11-LABEL: add_x_shl_max_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 4, v0 ; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535 ; GFX11-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -279,9 +275,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: ds_store_b8 v0, v1 ; GFX11-NEXT: s_endpgm @@ -324,9 +319,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: ds_store_b8 v0, v1 ; GFX11-NEXT: s_endpgm @@ -367,9 +361,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0 ; GFX11-NEXT: ds_store_b8 v0, v1 ; GFX11-NEXT: s_endpgm @@ -414,8 +407,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 { ; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:456 @@ -463,9 +455,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset: ; 
GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 @@ -512,10 +503,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fb, v0 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1 ; GFX11-NEXT: s_endpgm @@ -531,7 +521,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 { ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 +; CI-NEXT: s_load_dword s0, s[0:1], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0 ; CI-NEXT: s_mov_b64 vcc, 0 @@ -552,7 +542,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX9-NEXT: v_sub_u32_e32 v3, 0x3fb, v0 @@ -570,7 +560,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; ; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 @@ -588,12 +578,11 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0x7b -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -648,10 +637,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() # ; ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fc, v0 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1 ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll 
b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index b72cd7e1d1eca4..777a8f3fef1c17 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -15,7 +15,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -51,7 +51,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:255 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -64,7 +64,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:255 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -88,7 +88,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:1028 -; CI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -102,7 +102,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:1028 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -126,7 +126,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -142,7 +142,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -184,7 +184,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_barrier ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: s_mov_b32 s2, 0 @@ -202,7 +202,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: s_barrier ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -245,7 +245,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:2 offset1:8 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -261,7 +261,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:8 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -301,7 +301,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -319,7 +319,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -352,7 +352,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -370,7 +370,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) % ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -406,7 +406,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -419,7 +419,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -449,7 +449,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:32 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 
; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -463,7 +463,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -487,7 +487,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:32 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -501,7 +501,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -522,11 +522,13 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:34 ; CI-NEXT: ds_read_u8 v3, 
v1 offset:32 ; CI-NEXT: ds_read_u8 v4, v1 offset:3 @@ -535,13 +537,15 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: ds_read_u8 v7, v1 ; CI-NEXT: ds_read_u8 v8, v1 offset:33 ; CI-NEXT: ds_read_u8 v1, v1 offset:35 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_waitcnt lgkmcnt(5) ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_or_b32_e32 v6, v6, v7 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -550,7 +554,6 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: v_add_f32_e32 v2, v4, v1 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -558,8 +561,8 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-ALIGNED-LABEL: unaligned_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 @@ -582,17 +585,17 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] 
+; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: unaligned_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -612,11 +615,13 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_offset_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:11 ; CI-NEXT: ds_read_u8 v3, v1 offset:9 ; CI-NEXT: ds_read_u8 v4, v1 offset:8 @@ -625,13 +630,15 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; CI-NEXT: ds_read_u8 v7, v1 offset:5 ; CI-NEXT: ds_read_u8 v8, v1 offset:10 ; CI-NEXT: ds_read_u8 v1, v1 offset:12 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_waitcnt lgkmcnt(5) ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: 
v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_or_b32_e32 v6, v6, v7 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -640,7 +647,6 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: v_add_f32_e32 v2, v4, v1 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -648,8 +654,8 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; ; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 @@ -672,17 +678,17 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 
; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 offset:5 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -702,41 +708,44 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_2_simple_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 ; CI-NEXT: ds_read_u16 v2, v1 offset:32 ; CI-NEXT: ds_read_u16 v3, v1 offset:2 ; CI-NEXT: ds_read_u16 v4, v1 ; CI-NEXT: ds_read_u16 v1, v1 offset:34 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_waitcnt lgkmcnt(2) ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_or_b32_e32 v3, v3, v4 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_add_f32_e32 v2, v3, v1 -; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0 +; GFX9-ALIGNED-NEXT: 
v_add_u32_e32 v1, s2, v0 ; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1 ; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2 ; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32 ; GFX9-ALIGNED-NEXT: ds_read_u16 v1, v1 offset:34 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] @@ -744,12 +753,12 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ; ; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -772,7 +781,7 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v5, 0 @@ -785,7 +794,7 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 -; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -808,7 +817,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v5, 0 @@ -821,7 +830,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -845,7 +854,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[1:2], v0 ; CI-NEXT: ds_read_b64 v[3:4], v0 offset:2056 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -859,7 +868,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read_b64 v[0:1], v4 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:2056 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -880,15 +889,15 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr 
addrspace(1) %out) #0 { define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_read2_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v3, vcc, s0, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v0 ; CI-NEXT: ds_read2_b32 v[1:2], v3 offset1:1 ; CI-NEXT: ds_read2_b32 v[3:4], v3 offset0:14 offset1:15 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] @@ -898,13 +907,13 @@ define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: misaligned_read2_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 +; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:14 offset1:15 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -929,7 +938,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[0:1], v0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -941,7 +950,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr 
addrspace(1) %out) ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read_b64 v[0:1], v2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -959,7 +968,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -971,7 +980,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -991,7 +1000,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) % ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b128 v[0:3], v0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1004,7 +1013,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) % ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ds_read_b128 v[0:3], v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -1026,7 +1035,7 @@ define amdgpu_kernel void 
@load_misaligned64_constant_large_offsets(ptr addrspac ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[0:1], v2 offset:16384 ; CI-NEXT: ds_read_b64 v[2:3], v2 offset:32760 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1040,7 +1049,7 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ds_read_b64 v[0:1], v4 offset:16384 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:32760 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -1059,11 +1068,12 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb) #0 { ; CI-LABEL: sgemm_inner_loop_read2_sequence: ; CI: ; %bb.0: -; CI-NEXT: s_lshl_b32 s4, s6, 2 -; CI-NEXT: s_add_i32 s5, s4, 0xc20 -; CI-NEXT: s_addk_i32 s4, 0xc60 -; CI-NEXT: v_mov_b32_e32 v0, s5 -; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; CI-NEXT: s_lshl_b32 s0, s2, 2 +; CI-NEXT: s_add_i32 s1, s0, 0xc20 +; CI-NEXT: s_addk_i32 s0, 0xc60 +; CI-NEXT: v_mov_b32_e32 v0, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 @@ -1071,29 +1081,24 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, ; CI-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 ; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 ; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 -; CI-NEXT: s_waitcnt lgkmcnt(4) +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f32_e32 v0, v0, v1 -; CI-NEXT: s_waitcnt lgkmcnt(3) ; 
CI-NEXT: v_add_f32_e32 v0, v0, v2 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 -; CI-NEXT: s_waitcnt lgkmcnt(2) ; CI-NEXT: v_add_f32_e32 v0, v0, v4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: v_add_f32_e32 v0, v0, v5 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f32_e32 v0, v0, v6 ; CI-NEXT: v_add_f32_e32 v0, v0, v7 ; CI-NEXT: v_add_f32_e32 v0, v0, v8 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: v_add_f32_e32 v0, v0, v9 -; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: sgemm_inner_loop_read2_sequence: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_lshl_b32 s2, s6, 2 +; GFX9-NEXT: s_lshl_b32 s2, s2, 2 ; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 ; GFX9-NEXT: s_addk_i32 s2, 0xc60 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -1104,12 +1109,16 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, ; GFX9-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 ; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(4) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 @@ -1163,28 +1172,28 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CI-LABEL: misaligned_read2_v2i32: ; CI: ; %bb.0: -; CI-NEXT: 
s_load_dword s4, s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: misaligned_read2_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %load = load <2 x i32>, ptr addrspace(3) %in, align 4 store <2 x i32> %load, ptr addrspace(1) %out, align 8 @@ -1194,28 +1203,28 @@ define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CI-LABEL: misaligned_read2_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; 
GFX9-LABEL: misaligned_read2_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %load = load i64, ptr addrspace(3) %in, align 4 store i64 %load, ptr addrspace(1) %out, align 8 @@ -1225,8 +1234,8 @@ define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @ds_read_diff_base_interleaving( ; CI-LABEL: ds_read_diff_base_interleaving: ; CI: ; %bb.0: ; %bb -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1256,10 +1265,10 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving( ; ; GFX9-LABEL: ds_read_diff_base_interleaving: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, s4, v1 @@ -1461,7 +1470,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; CI-NEXT: ds_read_u8 v6, v0 offset:66 ; CI-NEXT: ds_read_u8 v0, v0 offset:65 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x0 ; CI-NEXT: v_or_b32_e32 v1, v2, v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 @@ -1488,7 +1497,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(7) ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v7 @@ -1505,7 +1514,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset: ; GFX9-UNALIGNED: ; %bb.0: ; %entry ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v2 offset:65 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 9f191fa69f6549..06908d21e53556 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_one_val_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_one_val_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -44,7 +44,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr ad define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -60,7 +60,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_two_val_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -85,7 +85,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr ad define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_volatile_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -105,7 +105,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) ; ; GFX9-LABEL: simple_write2_two_val_f32_volatile_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -131,7 +131,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr 
addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_volatile_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -151,7 +151,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) ; ; GFX9-LABEL: simple_write2_two_val_f32_volatile_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -182,7 +182,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -199,7 +199,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace ; ; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ; kill: killed $vgpr4 @@ -229,7 +229,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_subreg2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: 
s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -244,7 +244,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: simple_write2_two_val_subreg2_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -268,7 +268,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_subreg4_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 @@ -283,7 +283,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: simple_write2_two_val_subreg4_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -307,7 +307,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_max_offset_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -323,7 +323,7 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) ; ; GFX9-LABEL: simple_write2_two_val_max_offset_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -348,7 +348,7 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_too_far_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -368,7 +368,7 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: simple_write2_two_val_too_far_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -394,7 +394,7 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_x2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -413,7 +413,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr ; ; GFX9-LABEL: simple_write2_two_val_f32_x2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -450,7 +450,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr 
define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -469,7 +469,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa ; ; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -506,21 +506,21 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: write2_ptr_subreg_arg_two_val_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x6 -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x6 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[4:5] +; CI-NEXT: s_mov_b64 s[0:1], s[4:5] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: s_mov_b64 s[2:3], s[10:11] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: v_mov_b32_e32 v1, s12 +; CI-NEXT: s_mov_b64 s[4:5], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: 
buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v1, s8 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: v_mov_b32_e32 v3, s13 +; CI-NEXT: v_mov_b32_e32 v3, s9 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: ds_write_b32 v1, v2 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -529,14 +529,14 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: ds_write_b32 v0, v1 offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -566,7 +566,7 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_one_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -580,7 +580,7 @@ define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_one_val_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -601,15 +601,15 @@ define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr ad 
define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_simple_write2_one_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 -; CI-NEXT: s_load_dword s4, s[2:3], 0x4 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; CI-NEXT: s_load_dword s0, s[0:1], 0x4 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 @@ -618,11 +618,11 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) ; ; GFX9-LABEL: misaligned_simple_write2_one_val_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 @@ -642,15 +642,15 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_offset_simple_write2_one_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 -; CI-NEXT: s_load_dword s4, s[2:3], 0x4 -; CI-NEXT: 
s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; CI-NEXT: s_load_dword s0, s[0:1], 0x4 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 @@ -675,11 +675,11 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; ; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x10 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7 @@ -702,11 +702,11 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[2:3], 0x10 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-UNALIGNED-NEXT: 
global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:5 @@ -726,7 +726,7 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -742,7 +742,7 @@ define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_two_val_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc @@ -868,11 +868,11 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb, ptr addrspace(1) %in) #0 { ; CI-LABEL: write2_sgemm_sequence: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s0, s[0:1], 0x0 -; CI-NEXT: s_lshl_b32 s1, s6, 2 +; CI-NEXT: s_lshl_b32 s1, s2, 2 ; CI-NEXT: s_add_i32 s2, s1, 0xc20 ; CI-NEXT: s_addk_i32 s1, 0xc60 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -890,8 +890,8 @@ define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, ; ; GFX9-LABEL: write2_sgemm_sequence: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX9-NEXT: s_lshl_b32 s2, s6, 2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 2 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_add_i32 s1, s2, 0xc20 @@ -945,12 +945,12 @@ define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_v4f32_superreg_align4: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 -; CI-NEXT: s_load_dword s4, s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; CI-NEXT: s_load_dword s4, s[0:1], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s0 @@ -963,11 +963,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) ; ; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4 -; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1 @@ -979,11 +979,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) ; ; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0 ; 
GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4 -; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3 diff --git a/llvm/test/CodeGen/AMDGPU/early-inline.ll b/llvm/test/CodeGen/AMDGPU/early-inline.ll index 02ab2a065c0ef5..c1a049cf055cf0 100644 --- a/llvm/test/CodeGen/AMDGPU/early-inline.ll +++ b/llvm/test/CodeGen/AMDGPU/early-inline.ll @@ -25,7 +25,6 @@ entry: ; CHECK-LABEL: @alias_caller( ; CHECK-NOT: call -; CHECK: {{^[}]}} define amdgpu_kernel void @alias_caller(i32 %x) { entry: %res = call i32 @c_alias(i32 %x) diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll index 554cb140f42923..d958dde01c3f85 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll @@ -80,11 +80,9 @@ ; R600-NOT: .amd_amdgpu_hsa_metadata ; R600-NOT: .amd_amdgpu_pal_metadata -define amdgpu_kernel void @elf_notes() #0 { +define amdgpu_kernel void @elf_notes() { ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 32b9f9cb97095f..86ec6269b1c9bc 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ 
-6,10 +6,10 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3] ; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1] -; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: s_load_dword s14, s[6:7], 0x4 -; CHECK-NEXT: s_add_u32 s24, s24, s13 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x4 +; CHECK-NEXT: s_add_u32 s24, s24, s7 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s2, 0 @@ -24,7 +24,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_bitcmp1_b32 s1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s14, 8 +; CHECK-NEXT: s_bitcmp1_b32 s6, 8 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[16:17] ; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index 54fb1dc5c05274..c744ace37a8315 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -93,7 +93,7 @@ bb: define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-LABEL: s_add_co_br_user: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s0, s2, s2 ; GFX7-NEXT: s_cmp_lt_u32 s0, s2 @@ -120,7 +120,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX9-LABEL: s_add_co_br_user: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: s_add_i32 s0, s2, s2 ; GFX9-NEXT: s_cmp_lt_u32 s0, s2 @@ -146,7 +146,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX10-LABEL: s_add_co_br_user: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s1, s0, s0 ; GFX10-NEXT: s_cmp_lt_u32 s1, s0 @@ -172,7 +172,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX11-LABEL: s_add_co_br_user: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s1, s0, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index ee1df9aa0d6cea..db3ea4df52981c 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -180,8 +180,8 @@ entry: } ; GCN-LABEL: {{^}}float8_extelt: -; GCN-DAG: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-DAG: s_load_dword [[S0:s[0-9]+]], s[2:3], 0x2c +; GCN-DAG: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-DAG: s_load_dword [[S0:s[0-9]+]], s[0:1], 0x2c ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 @@ -411,10 +411,10 @@ entry: ; GCN-LABEL: {{^}}bit4_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s4, 3 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 ; GCN-NEXT: s_lshr_b32 s2, 0x1000100, s2 ; GCN-NEXT: s_and_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll 
b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index f4ec16db55d68a..70011e56d016e0 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: extract_vector_elt_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: extract_vector_elt_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -36,7 +36,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: extract_vector_elt_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 @@ -62,8 +62,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 { ; SI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s0, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s1, s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -77,8 +77,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 ; ; VI-LABEL: 
extract_vector_elt_v2f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -95,8 +95,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 @@ -119,8 +119,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 { ; SI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -139,15 +139,15 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 ; ; VI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; 
VI-NEXT: flat_load_dword v2, v[1:2] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -162,14 +162,12 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 ; ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v1, s[0:1] -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -195,7 +193,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo) #0 { ; SI-LABEL: extract_vector_elt_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -210,7 +208,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; ; VI-LABEL: extract_vector_elt_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -224,7 +222,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; ; GFX11-LABEL: extract_vector_elt_v3f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 
0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -249,8 +247,8 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo, i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b32 s4, s4, 4 @@ -264,8 +262,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; ; VI-LABEL: dynamic_extract_vector_elt_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -280,8 +278,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; GFX11-LABEL: dynamic_extract_vector_elt_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s4, 4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -302,7 +300,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_extractelement_v4f16_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 3, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -319,7 +317,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_extractelement_v4f16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -338,9 +336,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_extractelement_v4f16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -363,7 +359,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -384,7 +380,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -407,21 +403,20 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: 
v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: buffer_load_b32 v3, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 -; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[1:2], v3, v[1:2] +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -439,7 +434,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) %ptr) #0 { ; SI-LABEL: reduce_load_vector_v8f16_extract_01: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -456,7 +451,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) ; ; VI-LABEL: reduce_load_vector_v8f16_extract_01: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -473,7 +468,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) ; ; GFX11-LABEL: reduce_load_vector_v8f16_extract_01: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -500,7 +495,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) %ptr) #0 { ; SI-LABEL: reduce_load_vector_v8f16_extract_23: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s0, s[0:1], 0x1 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -517,7 +512,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) ; ; VI-LABEL: reduce_load_vector_v8f16_extract_23: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -534,7 +529,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) ; ; GFX11-LABEL: reduce_load_vector_v8f16_extract_23: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -561,8 +556,8 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { ; SI-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 @@ -613,8 +608,8 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; ; VI-LABEL: 
v_extractelement_v8f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -656,46 +651,43 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; ; GFX11-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v4 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v0, s[6:7] +; GFX11-NEXT: global_load_b128 v[1:4], v1, s[6:7] ; GFX11-NEXT: s_cmp_eq_u32 s0, 1 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX11-NEXT: s_cmp_eq_u32 s0, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 4 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 5 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 6 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX11-NEXT: s_cmp_eq_u32 s0, 7 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -712,8 +704,8 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { ; SI-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0 @@ -802,8 +794,8 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr 
addrspace(1) ; ; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -877,81 +869,78 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; ; GFX11-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] -; GFX11-NEXT: global_load_b128 v[4:7], v4, s[6:7] offset:16 +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7] offset:16 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 1 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 2 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GFX11-NEXT: s_cmp_eq_u32 s0, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; 
GFX11-NEXT: s_cmp_eq_u32 s0, 4 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 5 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 6 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX11-NEXT: s_cmp_eq_u32 s0, 7 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v5 ; GFX11-NEXT: s_cmp_eq_u32 s0, 9 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v6 ; GFX11-NEXT: s_cmp_eq_u32 s0, 11 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v7 ; GFX11-NEXT: s_cmp_eq_u32 s0, 13 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 14 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX11-NEXT: s_cmp_eq_u32 s0, 15 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index d670d69947361c..b69852da247445 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -81,7 +81,7 @@ define amdgpu_kernel void 
@extract_vector_elt_v3i16(ptr addrspace(1) %out, <3 x ; SI: buffer_store_short ; SI: buffer_store_short -; GFX89-DAG: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[2:3], 0x24 +; GFX89-DAG: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[0:1], 0x24 ; GFX89-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], s[[#LOAD + 2]] ; GFX89-DAG: buffer_store_short [[VLOAD0]], off ; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[#LOAD + 3]] @@ -100,9 +100,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(ptr addrspace(1) %out, <4 x ; SI: s_load_dwordx2 s ; SI: s_load_dwordx2 s -; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[2:3], 0x24 -; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[2:3], 0x4c -; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[2:3], 0x54 +; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[0:1], 0x24 +; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[0:1], 0x4c +; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[0:1], 0x54 ; GCN-NOT: {{buffer|flat|global}} diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index 164352ef75b3b9..331fe26160d412 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -133,8 +133,8 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x ; isTypeDesirableForOp in SimplifyDemandedBits ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x4c -; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[6:7], 0x28 +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c +; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 ; VI-NOT: {{flat|buffer|global}} ; VI-DAG: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 @@ -147,8 +147,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8: -; 
VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x4c -; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[6:7], 0x28 +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c +; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 ; VI-NOT: {{flat|buffer|global}} ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 ; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]] @@ -162,7 +162,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x30 +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x30 ; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 @@ -179,7 +179,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x10 +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x10 ; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 diff --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll index 06da7eea0b47dc..d5464ce6aa8a33 100644 --- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -8,8 +8,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @bitcast_int_to_vector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %b) { ; GCN-LABEL: bitcast_int_to_vector_extract_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s12, s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s12, s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -38,8 +38,8 @@ define amdgpu_kernel void @bitcast_int_to_vector_extract_0(ptr addrspace(1) 
%out define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, double %b) { ; GCN-LABEL: bitcast_fp_to_vector_extract_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -68,8 +68,8 @@ define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(ptr addrspace(1) %out, define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %b) { ; GCN-LABEL: bitcast_int_to_fpvector_extract_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s12, s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s12, s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -98,7 +98,7 @@ define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(ptr addrspace(1) %o define amdgpu_kernel void @no_extract_volatile_load_extract0(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: no_extract_volatile_load_extract0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -122,7 +122,7 @@ entry: define amdgpu_kernel void @no_extract_volatile_load_extract2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: no_extract_volatile_load_extract2: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -146,8 +146,8 @@ entry: define amdgpu_kernel void 
@no_extract_volatile_load_dynextract(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { ; GCN-LABEL: no_extract_volatile_load_dynextract: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s12, s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s12, s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s10, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 21799ab79b8396..f34824cd6cefe1 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI-LABEL: s_fabs_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -23,8 +23,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; VI-LABEL: s_fabs_free_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -35,8 +35,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; GFX9-LABEL: s_fabs_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff @@ -47,10 +47,10 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr 
addrspace(1) %out, i16 %in) { ; GFX11-LABEL: s_fabs_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -66,8 +66,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI-LABEL: s_fabs_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -78,8 +78,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: s_fabs_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -90,8 +90,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; GFX9-LABEL: s_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff @@ -102,10 +102,10 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: s_fabs_f16: ; GFX11: 
; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -120,8 +120,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -132,8 +132,8 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; ; VI-LABEL: s_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -144,8 +144,8 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; ; GFX9-LABEL: s_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -156,10 +156,10 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GFX11-LABEL: s_fabs_v2f16: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -174,7 +174,7 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-LABEL: s_fabs_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -187,7 +187,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; VI-LABEL: s_fabs_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -200,7 +200,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; GFX9-LABEL: s_fabs_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff @@ -212,7 +212,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; GFX11-LABEL: s_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff @@ -231,12 +231,12 @@ 
define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) { ; CI-LABEL: fabs_fold_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -247,8 +247,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; ; VI-LABEL: fabs_fold_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -260,8 +260,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; ; GFX9-LABEL: fabs_fold_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -273,13 +273,13 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; GFX11-LABEL: fabs_fold_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mul_f16_e64 v1, |s4|, s2 +; GFX11-NEXT: v_mul_f16_e64 v1, |s2|, s3 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -293,7 +293,7 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -307,7 +307,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -321,7 +321,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -332,9 +332,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_fabs_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -356,8 +354,8 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) 
#0 { ; CI-LABEL: fabs_free_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -368,8 +366,8 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; VI-LABEL: fabs_free_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -380,8 +378,8 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX9-LABEL: fabs_free_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -392,10 +390,10 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX11-LABEL: fabs_free_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -413,7 +411,7 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) 
%out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fabs_fold_self_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -439,7 +437,7 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: v_fabs_fold_self_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -457,7 +455,7 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_fabs_fold_self_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -470,15 +468,14 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_fabs_fold_self_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -496,8 +493,8 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, 
i32 %other.val) #0 { ; CI-LABEL: v_fabs_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -524,8 +521,8 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_fabs_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -545,30 +542,28 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_fabs_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s6 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_fabs_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: 
s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 @@ -587,7 +582,7 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fabs_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -610,7 +605,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; VI-LABEL: v_extract_fabs_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -629,7 +624,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX9-LABEL: v_extract_fabs_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -645,9 +640,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX11-LABEL: v_extract_fabs_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] @@ -680,7 +673,7 @@ define 
amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fabs_no_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -698,7 +691,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; VI-LABEL: v_extract_fabs_no_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -716,7 +709,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX9-LABEL: v_extract_fabs_no_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] @@ -730,9 +723,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX11-LABEL: v_extract_fabs_no_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 60e19dcd48f1e6..07581ade57ccd5 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -39,25 +39,25 @@ define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { ; 
SI-LABEL: s_fabsf_free: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s4, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_bitset0_b32 s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fabsf_free: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_bitset0_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_bitset0_b32 s0, 31 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %bc= bitcast i32 %in to float @@ -69,25 +69,25 @@ define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: s_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s4, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_bitset0_b32 s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, 
s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_bitset0_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_bitset0_b32 s0, 31 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %in) @@ -98,7 +98,7 @@ define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fabs_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-LABEL: fabs_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s3, 31 ; VI-NEXT: s_bitset0_b32 s2, 31 @@ -131,26 +131,26 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-LABEL: fabsf_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s7, 31 -; SI-NEXT: 
s_bitset0_b32 s6, 31 -; SI-NEXT: s_bitset0_b32 s5, 31 -; SI-NEXT: s_bitset0_b32 s4, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_bitset0_b32 s2, 31 +; SI-NEXT: s_bitset0_b32 s1, 31 +; SI-NEXT: s_bitset0_b32 s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fabsf_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_bitset0_b32 s3, 31 @@ -202,7 +202,7 @@ define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, floa define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) { ; SI-LABEL: fabs_fold: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -215,7 +215,7 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i ; ; VI-LABEL: fabs_fold: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0 @@ -232,23 +232,23 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: bitpreserve_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: 
s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, |s4|, 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_add_f32_e64 v0, |s0|, 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bitpreserve_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_f32_e64 v2, |s2|, 1.0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_add_f32_e64 v2, |s0|, 1.0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %in.bc = bitcast float %in to i32 diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index 7352fcdd071d5b..cdc6b5a48d0a69 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @fadd_f16( ; SI-LABEL: fadd_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s2, s10 @@ -35,8 +35,8 @@ define amdgpu_kernel void @fadd_f16( ; ; VI-LABEL: fadd_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; 
VI-NEXT: s_mov_b32 s2, s10 @@ -59,8 +59,8 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-SDAG-LABEL: fadd_f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s3, s11 @@ -87,8 +87,8 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-GISEL-LABEL: fadd_f16: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: s_mov_b32 s10, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -111,8 +111,8 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-FAKE16-SDAG-LABEL: fadd_f16: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1 -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s10, -1 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s11 @@ -137,8 +137,8 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-FAKE16-GISEL-LABEL: fadd_f16: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1 -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; 
GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s10, -1 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -196,7 +196,7 @@ entry: define amdgpu_kernel void @fadd_f16_imm_a( ; SI-LABEL: fadd_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -216,7 +216,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; VI-LABEL: fadd_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -234,7 +234,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-SDAG-LABEL: fadd_f16_imm_a: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -256,7 +256,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-GISEL-LABEL: fadd_f16_imm_a: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -274,7 +274,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_a: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -294,7 +294,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-FAKE16-GISEL-LABEL: 
fadd_f16_imm_a: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -340,7 +340,7 @@ entry: define amdgpu_kernel void @fadd_f16_imm_b( ; SI-LABEL: fadd_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -360,7 +360,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; VI-LABEL: fadd_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -378,7 +378,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-SDAG-LABEL: fadd_f16_imm_b: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -400,7 +400,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-GISEL-LABEL: fadd_f16_imm_b: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -418,7 +418,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_b: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; 
GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -438,7 +438,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_b: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -484,8 +484,8 @@ entry: define amdgpu_kernel void @fadd_v2f16( ; SI-LABEL: fadd_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -518,8 +518,8 @@ define amdgpu_kernel void @fadd_v2f16( ; ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -544,13 +544,11 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-SDAG-LABEL: fadd_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[8:9], s[2:3], 0x34 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[8:9], s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: 
s_clause 0x1 ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] @@ -567,10 +565,8 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-GISEL-LABEL: fadd_v2f16: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 @@ -588,13 +584,11 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1 -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[8:9], s[2:3], 0x34 -; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[8:9], s[0:1], 0x34 +; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1 ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] @@ -611,10 +605,8 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1 -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 
s[4:7], s[0:1], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1 @@ -665,7 +657,7 @@ entry: define amdgpu_kernel void @fadd_v2f16_imm_a( ; SI-LABEL: fadd_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -692,7 +684,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; VI-LABEL: fadd_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -713,12 +705,10 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-SDAG-LABEL: fadd_v2f16_imm_a: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 @@ -732,9 +722,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-GISEL-LABEL: fadd_v2f16_imm_a: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] @@ -749,12 
+737,10 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_a: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0 @@ -768,9 +754,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_a: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] @@ -812,7 +796,7 @@ entry: define amdgpu_kernel void @fadd_v2f16_imm_b( ; SI-LABEL: fadd_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -839,7 +823,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; VI-LABEL: fadd_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -860,12 +844,10 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-SDAG-LABEL: 
fadd_v2f16_imm_b: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 @@ -879,9 +861,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-GISEL-LABEL: fadd_v2f16_imm_b: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] @@ -896,12 +876,10 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_b: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0 @@ -915,9 +893,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_b: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 
-; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 7252c69cb1cf75..4bfaa6e90bdfee 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -73,7 +73,7 @@ define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 { define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 { ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-ALIGNED: ; %bb.0: -; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -89,7 +89,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -99,7 +99,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX9-LABEL: global_store_2xi16_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -108,7 +108,7 @@ define amdgpu_kernel void 
@global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX10-LABEL: global_store_2xi16_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -117,7 +117,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX11-LABEL: global_store_2xi16_align2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -127,7 +127,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX12-LABEL: global_store_2xi16_align2: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -219,7 +219,7 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 { define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 { ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: -; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) @@ -246,7 +246,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -256,7 +256,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX9-LABEL: global_store_2xi16_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -265,7 +265,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX10-LABEL: global_store_2xi16_align1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -274,7 +274,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX11-LABEL: global_store_2xi16_align1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -284,7 +284,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX12-LABEL: global_store_2xi16_align1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -358,7 +358,7 @@ define i32 @global_load_2xi16_align4(ptr addrspace(1) %p) #0 { define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 { ; GFX7-ALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-ALIGNED: ; %bb.0: -; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 
0x20001 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -368,7 +368,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -378,7 +378,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX9-LABEL: global_store_2xi16_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -387,7 +387,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX10-LABEL: global_store_2xi16_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -396,7 +396,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX11-LABEL: global_store_2xi16_align4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -406,7 +406,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX12-LABEL: global_store_2xi16_align4: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 7af972b96ec68c..581b7b4cff9ed0 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -21,7 +21,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_undef_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -31,7 +31,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_undef_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v0, v0, s[0:1] @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_undef_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -49,7 +49,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_undef_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] @@ -64,7 +64,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( define amdgpu_kernel void 
@v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -76,7 +76,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] @@ -87,7 +87,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; ; CI-LABEL: v_test_canonicalize_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -100,7 +100,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] @@ -119,10 +119,10 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 { ; VI-LABEL: s_test_canonicalize_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_e64 v2, s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: 
v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -130,35 +130,34 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ; ; GFX9-LABEL: s_test_canonicalize_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f16_e64 v1, s4, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; CI-LABEL: s_test_canonicalize_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0xb -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_canonicalize_var_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, s4, s4 +; GFX11-NEXT: v_max_f16_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -169,6 +168,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ret void } +define half @s_test_canonicalize_arg(half %x) #1 { +; VI-LABEL: s_test_canonicalize_arg: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_test_canonicalize_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: s_test_canonicalize_arg: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_test_canonicalize_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %canonicalized = call half @llvm.canonicalize.f16(half %x) + ret half %canonicalized +} + define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 { ; VI-LABEL: v_test_canonicalize_build_vector_v2f16: ; VI: ; %bb.0: @@ -211,7 +239,7 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fabs_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -223,7 +251,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -234,7 +262,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; ; CI-LABEL: v_test_canonicalize_fabs_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -247,7 +275,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -267,7 +295,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -279,7 +307,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -290,7 +318,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; ; CI-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -303,7 +331,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -324,7 +352,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -336,7 +364,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -347,7 +375,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; ; CI-LABEL: v_test_canonicalize_fneg_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -360,7 +388,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -380,7 +408,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #2 { ; VI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -392,7 +420,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; ; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -403,7 +431,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; ; CI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -416,7 +444,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; ; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -436,7 +464,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #2 { ; VI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -448,7 +476,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; ; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -459,7 +487,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; ; CI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -472,7 +500,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; ; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -493,7 +521,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -503,7 +531,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v0, v0, s[0:1] @@ -511,7 +539,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_p0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; 
CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -521,7 +549,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] @@ -536,7 +564,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -546,7 +574,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -555,7 +583,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_n0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x8000 @@ -565,7 +593,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, 
s[0:1] @@ -580,7 +608,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -590,7 +618,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -599,7 +627,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_p1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3c00 @@ -609,7 +637,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -624,7 +652,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffffbc00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v0, s0 @@ -634,7 +662,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffbc00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -643,7 +671,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_n1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0xbc00 @@ -653,7 +681,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -668,7 +696,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_literal_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -678,7 +706,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -687,7 +715,7 @@ 
define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; ; CI-LABEL: test_fold_canonicalize_literal_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x4c00 @@ -697,7 +725,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -712,7 +740,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -722,7 +750,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; ; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -731,7 +759,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; ; CI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 
0x3ff @@ -741,7 +769,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; ; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -756,7 +784,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -766,7 +794,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -775,7 +803,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff @@ -785,7 +813,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 
0x3ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -800,7 +828,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -810,7 +838,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; ; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -819,7 +847,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; ; CI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff @@ -829,7 +857,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; ; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -844,7 +872,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #3 { ; 
VI-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -854,7 +882,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -863,7 +891,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff @@ -873,7 +901,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -888,7 +916,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -898,7 +926,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) 
%out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -907,7 +935,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_qnan_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7c00 @@ -917,7 +945,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -932,7 +960,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -942,7 +970,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -951,7 +979,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; ; 
CI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -961,7 +989,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -976,7 +1004,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -986,7 +1014,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -995,7 +1023,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1005,7 +1033,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; 
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1020,7 +1048,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan0_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1030,7 +1058,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1039,7 +1067,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan0_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1049,7 +1077,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1064,7 +1092,7 @@ define amdgpu_kernel void 
@test_fold_canonicalize_snan0_value_f16(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan1_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1074,7 +1102,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1083,7 +1111,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan1_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1093,7 +1121,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1108,7 +1136,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan2_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) 
; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1118,7 +1146,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1127,7 +1155,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan2_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1137,7 +1165,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1152,7 +1180,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan3_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1162,7 +1190,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 
; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1171,7 +1199,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan3_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1181,7 +1209,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1196,7 +1224,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1214,7 +1242,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_test_canonicalize_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1226,7 +1254,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; ; CI-LABEL: v_test_canonicalize_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1249,10 +1277,8 @@ 
define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_test_canonicalize_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1272,7 +1298,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fabs_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1290,7 +1316,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1303,7 +1329,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; ; CI-LABEL: v_test_canonicalize_fabs_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1326,14 +1352,13 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1351,7 +1376,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1369,7 +1394,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1382,7 +1407,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; ; CI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1406,14 +1431,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1432,7 +1456,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1450,7 +1474,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1462,7 +1486,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; ; CI-LABEL: v_test_canonicalize_fneg_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1486,10 +1510,8 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; ; 
GFX11-LABEL: v_test_canonicalize_fneg_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1510,12 +1532,12 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 { ; VI-LABEL: s_test_canonicalize_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_max_f16_e64 v0, s2, s2 ; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1525,40 +1547,39 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_test_canonicalize_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; CI-LABEL: s_test_canonicalize_var_v2f16: ; CI: ; %bb.0: 
-; CI-NEXT: s_load_dword s0, s[2:3], 0xb +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_lshr_b32 s3, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_or_b32_e32 v0, v1, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_canonicalize_var_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v1, s4, s4 +; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1572,7 +1593,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1582,7 +1603,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_p0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -1590,7 +1611,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_p0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -1600,7 +1621,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_p0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1615,7 +1636,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x80008000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1625,7 +1646,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_n0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x80008000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1634,7 +1655,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_n0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 
s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x80008000 @@ -1644,7 +1665,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_n0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1659,7 +1680,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3c003c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1669,7 +1690,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_p1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c003c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1678,7 +1699,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_p1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3c003c00 @@ -1688,7 +1709,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_p1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1703,7 +1724,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xbc00bc00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1713,7 +1734,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_n1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00bc00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1722,7 +1743,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_n1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 @@ -1732,7 +1753,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_n1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1747,7 +1768,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_literal_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4c004c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1757,7 +1778,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) ; ; GFX9-LABEL: test_fold_canonicalize_literal_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c004c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1766,7 +1787,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) ; ; CI-LABEL: test_fold_canonicalize_literal_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x4c004c00 @@ -1776,7 +1797,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) ; ; GFX11-LABEL: test_fold_canonicalize_literal_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1791,7 +1812,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1801,7 +1822,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1810,7 +1831,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p ; ; CI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff03ff @@ -1820,7 +1841,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1835,7 +1856,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1845,7 +1866,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1854,7 +1875,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr ; ; CI-LABEL: 
test_denormals_fold_canonicalize_denormal0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff03ff @@ -1864,7 +1885,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1879,7 +1900,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1889,7 +1910,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1898,7 +1919,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p ; ; CI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff83ff @@ -1908,7 +1929,7 @@ define amdgpu_kernel 
void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1923,7 +1944,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1933,7 +1954,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1942,7 +1963,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff83ff @@ -1952,7 +1973,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1967,7 +1988,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7c007c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1977,7 +1998,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o ; ; GFX9-LABEL: test_fold_canonicalize_qnan_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c007c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1986,7 +2007,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o ; ; CI-LABEL: test_fold_canonicalize_qnan_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7c007c00 @@ -1996,7 +2017,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o ; ; GFX11-LABEL: test_fold_canonicalize_qnan_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2011,7 +2032,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2021,7 +2042,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2030,7 +2051,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2040,7 +2061,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2055,7 +2076,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2065,7 +2086,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; 
GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2074,7 +2095,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2084,7 +2105,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2099,7 +2120,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan0_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2109,7 +2130,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2118,7 +2139,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac ; ; CI-LABEL: 
test_fold_canonicalize_snan0_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2128,7 +2149,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2143,7 +2164,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2153,7 +2174,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2162,7 +2183,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2172,7 +2193,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac 
; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2187,7 +2208,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2197,7 +2218,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2206,7 +2227,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2216,7 +2237,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2231,7 +2252,7 @@ define 
amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2241,7 +2262,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2250,7 +2271,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2260,7 +2281,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2355,7 +2376,7 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 { define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: s_test_canonicalize_undef_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2365,7 +2386,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: s_test_canonicalize_undef_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -2373,7 +2394,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: s_test_canonicalize_undef_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -2383,7 +2404,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: s_test_canonicalize_undef_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -2657,7 +2678,7 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 { define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: s_test_canonicalize_undef_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2668,7 +2689,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; ; GFX9-LABEL: s_test_canonicalize_undef_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2677,7 +2698,7 @@ define 
amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; ; CI-LABEL: s_test_canonicalize_undef_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 @@ -2688,7 +2709,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; ; GFX11-LABEL: s_test_canonicalize_undef_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index f0ce96af90649d..d53c0411ad88c1 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -23,7 +23,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -35,7 +35,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -46,7 +46,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -59,7 +59,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX12-LABEL: v_test_canonicalize_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -78,8 +78,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, float %val) #1 { ; GFX6-LABEL: s_test_canonicalize_var_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[6:7], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -89,8 +89,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX8-LABEL: s_test_canonicalize_var_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -100,8 +100,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX9-LABEL: s_test_canonicalize_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -111,11 +111,11 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr 
addrspace(1) %out, fl ; GFX11-LABEL: s_test_canonicalize_var_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e64 v1, s4, s4 +; GFX11-NEXT: v_max_f32_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -123,7 +123,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX12-LABEL: s_test_canonicalize_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -162,7 +162,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -175,7 +175,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -195,7 +195,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -207,7 +207,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -218,7 +218,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -231,7 +231,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -252,7 +252,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -264,7 +264,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -275,7 +275,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -288,7 +288,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -308,7 +308,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_undef_f32: ; GFX678: ; %bb.0: -; 
GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -318,7 +318,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: test_fold_canonicalize_undef_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -326,7 +326,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_fold_canonicalize_undef_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -336,7 +336,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: test_fold_canonicalize_undef_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -351,7 +351,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -361,7 +361,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -369,7 +369,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -379,7 +379,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -394,7 +394,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -404,7 +404,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -413,7 +413,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 
0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -424,7 +424,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -440,7 +440,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -450,7 +450,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -459,7 +459,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -469,7 +469,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: 
v_dual_mov_b32 v1, 1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -484,7 +484,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, -1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -494,7 +494,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, -1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -503,7 +503,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -513,7 +513,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -528,7 +528,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_literal_f32: ; GFX678: ; %bb.0: -; 
GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -538,7 +538,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -547,7 +547,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -557,7 +557,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX12-LABEL: test_fold_canonicalize_literal_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -572,7 +572,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -582,7 +582,7 @@ define amdgpu_kernel void 
@test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -590,7 +590,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -600,7 +600,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -615,7 +615,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic(ptr addrspace(1) %out) #5 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -626,7 +626,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -636,7 +636,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -647,7 +647,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -663,7 +663,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out(ptr addrspace(1) %out) #6 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -674,7 +674,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -684,7 +684,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX11: ; 
%bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -695,7 +695,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -711,7 +711,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in(ptr addrspace(1) %out) #7 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -722,7 +722,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -732,7 +732,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -743,7 +743,7 @@ define 
amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -759,7 +759,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -769,7 +769,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -778,7 +778,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -788,7 +788,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 
v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -803,7 +803,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -813,7 +813,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -822,7 +822,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -833,7 +833,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -849,7 +849,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: 
test_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -859,7 +859,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x807fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -878,7 +878,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -893,7 +893,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ 
-903,7 +903,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -912,7 +912,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -922,7 +922,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX12-LABEL: test_fold_canonicalize_qnan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -937,7 +937,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -947,7 +947,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -956,7 +956,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -966,7 +966,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -981,7 +981,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -991,7 +991,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1000,7 +1000,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1010,7 +1010,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1025,7 +1025,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1044,7 +1044,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1054,7 +1054,7 @@ define amdgpu_kernel void 
@test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1069,7 +1069,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1079,7 +1079,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1113,7 +1113,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1142,7 +1142,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: 
test_fold_canonicalize_snan3_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1167,7 +1167,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1176,7 +1176,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1186,7 +1186,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1201,7 +1201,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1213,7 +1213,7 @@ define amdgpu_kernel 
void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1224,7 +1224,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1237,7 +1237,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX12-LABEL: v_test_canonicalize_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1256,7 +1256,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, double %val) #1 { ; GFX6-LABEL: s_test_canonicalize_var_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX8-LABEL: s_test_canonicalize_var_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -1276,7 +1276,7 @@ define amdgpu_kernel void 
@s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX9-LABEL: s_test_canonicalize_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX11-LABEL: s_test_canonicalize_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] @@ -1296,7 +1296,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX12-LABEL: s_test_canonicalize_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1324,7 +1324,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1335,7 +1335,7 @@ define amdgpu_kernel void 
@v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1348,7 +1348,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1368,7 +1368,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1380,7 +1380,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1391,7 +1391,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1404,7 +1404,7 @@ define 
amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1425,7 +1425,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1437,7 +1437,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1448,7 +1448,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1461,7 +1461,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1481,7 +1481,7 @@ define 
amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1492,7 +1492,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1501,7 +1501,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v1, v0 @@ -1530,7 +1530,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: 
s_waitcnt lgkmcnt(0) @@ -1541,7 +1541,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1550,7 +1550,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1577,7 +1577,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1588,7 +1588,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1597,7 +1597,7 @@ define amdgpu_kernel 
void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1607,7 +1607,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1622,7 +1622,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1642,7 +1642,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
global_store_b64 v0, v[0:1], s[0:1] @@ -1652,7 +1652,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1667,7 +1667,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_literal_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1678,7 +1678,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1687,7 +1687,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1697,7 +1697,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX12-LABEL: test_fold_canonicalize_literal_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1712,7 +1712,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #2 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1723,7 +1723,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 @@ -1744,7 +1744,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v1, v0 @@ -1761,7 +1761,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr define amdgpu_kernel void 
@test_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 ; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1772,7 +1772,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xfffff @@ -1782,7 +1782,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1793,7 +1793,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1809,7 +1809,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #2 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1820,7 +1820,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1829,7 +1829,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1840,7 +1840,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1856,7 +1856,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1867,7 +1867,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x800fffff @@ -1877,7 +1877,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1888,7 +1888,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1904,7 +1904,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1915,7 +1915,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1924,7 +1924,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f64: ; GFX11: ; 
%bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1934,7 +1934,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX12-LABEL: test_fold_canonicalize_qnan_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1949,7 +1949,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1960,7 +1960,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1969,7 +1969,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1979,7 +1979,7 @@ 
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2005,7 +2005,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2014,7 +2014,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2024,7 +2024,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2039,7 +2039,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2050,7 +2050,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2059,7 +2059,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2069,7 +2069,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2084,7 +2084,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( define amdgpu_kernel void 
@test_fold_canonicalize_snan1_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2095,7 +2095,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2104,7 +2104,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2114,7 +2114,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2129,7 +2129,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2140,7 +2140,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2149,7 +2149,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2159,7 +2159,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2174,7 +2174,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2185,7 +2185,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2194,7 +2194,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2204,7 +2204,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2219,7 +2219,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f64_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2236,7 +2236,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f64_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; 
GFX9-LABEL: test_canonicalize_value_f64_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -2264,9 +2264,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f64_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2279,9 +2277,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f64_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2303,7 +2299,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f32_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2320,7 +2316,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f32_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2337,7 +2333,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f32_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2348,9 +2344,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f32_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2363,9 +2357,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f32_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2387,7 +2379,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f16_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2405,7 +2397,7 @@ define amdgpu_kernel 
void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f16_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2422,7 +2414,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f16_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -2433,9 +2425,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f16_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2448,9 +2438,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f16_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2473,7 +2461,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_v2f16_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2496,7 +2484,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX8-LABEL: test_canonicalize_value_v2f16_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2516,7 +2504,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX9-LABEL: test_canonicalize_value_v2f16_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2527,9 +2515,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX11-LABEL: test_canonicalize_value_v2f16_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2542,9 +2528,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX12-LABEL: test_canonicalize_value_v2f16_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2566,7 +2550,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr 
addrspace(1) define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f64_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2583,7 +2567,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f64_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2600,7 +2584,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f64_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -2611,9 +2595,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f64_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2626,9 +2608,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f64_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2650,7 +2630,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f32_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2667,7 +2647,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f32_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2684,7 +2664,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f32_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2695,9 +2675,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f32_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2710,9 +2688,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; 
GFX12-LABEL: test_canonicalize_value_f32_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2735,7 +2711,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f16_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2753,7 +2729,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2770,7 +2746,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -2781,9 +2757,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2796,9 +2770,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f16_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2822,7 +2794,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2845,7 +2817,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX8-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2864,7 +2836,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX9-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2875,9 +2847,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX11-LABEL: 
test_canonicalize_value_v2f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2890,9 +2860,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX12-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2914,7 +2882,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) #1 { ; GFX6-LABEL: v_test_canonicalize_var_v2f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2931,7 +2899,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_test_canonicalize_var_v2f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2948,7 +2916,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_test_canonicalize_var_v2f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2961,11 +2929,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_test_canonicalize_var_v2f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2978,11 +2944,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_test_canonicalize_var_v2f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 845b25a8f61bd7..7d8f43bbe16b73 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @fcmp_f16_lt( ; SI-LABEL: fcmp_f16_lt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -32,33 +32,33 @@ define amdgpu_kernel void @fcmp_f16_lt( ; ; VI-LABEL: fcmp_f16_lt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_lt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -95,8 +95,8 @@ entry: define amdgpu_kernel void @fcmp_f16_lt_abs( ; SI-LABEL: fcmp_f16_lt_abs: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -121,33 +121,33 @@ define amdgpu_kernel void @fcmp_f16_lt_abs( ; ; VI-LABEL: 
fcmp_f16_lt_abs: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 -; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], |v0|, |v1| -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_lt_f16_e64 s[4:5], |v0|, |v1| +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_lt_abs: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -187,8 +187,8 @@ entry: define amdgpu_kernel void @fcmp_f16_eq( ; SI-LABEL: fcmp_f16_eq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -213,33 +213,33 @@ define amdgpu_kernel void @fcmp_f16_eq( ; ; VI-LABEL: fcmp_f16_eq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_eq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -276,8 +276,8 @@ entry: define amdgpu_kernel void @fcmp_f16_le( ; SI-LABEL: fcmp_f16_le: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -302,33 +302,33 @@ define amdgpu_kernel void @fcmp_f16_le( ; ; VI-LABEL: fcmp_f16_le: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_le_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_le: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -365,8 +365,8 @@ entry: define amdgpu_kernel void @fcmp_f16_gt( ; SI-LABEL: fcmp_f16_gt: 
; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -391,33 +391,33 @@ define amdgpu_kernel void @fcmp_f16_gt( ; ; VI-LABEL: fcmp_f16_gt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_gt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ 
-454,8 +454,8 @@ entry: define amdgpu_kernel void @fcmp_f16_lg( ; SI-LABEL: fcmp_f16_lg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -480,33 +480,33 @@ define amdgpu_kernel void @fcmp_f16_lg( ; ; VI-LABEL: fcmp_f16_lg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_lg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 
s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -543,8 +543,8 @@ entry: define amdgpu_kernel void @fcmp_f16_ge( ; SI-LABEL: fcmp_f16_ge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -569,33 +569,33 @@ define amdgpu_kernel void @fcmp_f16_ge( ; ; VI-LABEL: fcmp_f16_ge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_ge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -632,8 +632,8 @@ entry: define amdgpu_kernel void @fcmp_f16_o( ; SI-LABEL: fcmp_f16_o: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -658,33 +658,33 @@ define amdgpu_kernel void @fcmp_f16_o( ; ; VI-LABEL: fcmp_f16_o: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_o: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -721,8 +721,8 @@ entry: define amdgpu_kernel void @fcmp_f16_u( ; SI-LABEL: fcmp_f16_u: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -747,33 +747,33 @@ define amdgpu_kernel void @fcmp_f16_u( ; ; VI-LABEL: fcmp_f16_u: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_u_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_u: ; GFX11: ; %bb.0: ; %entry 
; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -810,8 +810,8 @@ entry: define amdgpu_kernel void @fcmp_f16_nge( ; SI-LABEL: fcmp_f16_nge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -836,33 +836,33 @@ define amdgpu_kernel void @fcmp_f16_nge( ; ; VI-LABEL: fcmp_f16_nge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -899,8 +899,8 @@ entry: define amdgpu_kernel void @fcmp_f16_nlg( ; SI-LABEL: fcmp_f16_nlg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -925,33 +925,33 @@ define amdgpu_kernel void @fcmp_f16_nlg( ; ; VI-LABEL: fcmp_f16_nlg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, 
vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nlg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -988,8 +988,8 @@ entry: define amdgpu_kernel void @fcmp_f16_ngt( ; SI-LABEL: fcmp_f16_ngt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1014,33 +1014,33 @@ define amdgpu_kernel void @fcmp_f16_ngt( ; ; VI-LABEL: fcmp_f16_ngt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: 
s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_ngt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1077,8 +1077,8 @@ entry: define amdgpu_kernel void @fcmp_f16_nle( ; SI-LABEL: fcmp_f16_nle: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1103,33 +1103,33 @@ define amdgpu_kernel void @fcmp_f16_nle( ; ; VI-LABEL: fcmp_f16_nle: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; 
VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nle: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1166,8 +1166,8 @@ entry: define amdgpu_kernel void @fcmp_f16_neq( ; SI-LABEL: fcmp_f16_neq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1192,33 +1192,33 @@ define amdgpu_kernel void @fcmp_f16_neq( ; ; VI-LABEL: fcmp_f16_neq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 
0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_neq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1255,8 +1255,8 @@ entry: define amdgpu_kernel void @fcmp_f16_nlt( ; SI-LABEL: fcmp_f16_nlt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1281,33 +1281,33 @@ define amdgpu_kernel void @fcmp_f16_nlt( ; ; VI-LABEL: fcmp_f16_nlt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, 
s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nlt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1344,8 +1344,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_lt( ; SI-LABEL: fcmp_v2f16_lt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1376,21 +1376,21 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; ; VI-LABEL: fcmp_v2f16_lt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1399,14 +1399,14 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_lt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1449,8 +1449,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_eq( ; SI-LABEL: fcmp_v2f16_eq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1481,21 +1481,21 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; ; VI-LABEL: fcmp_v2f16_eq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 
0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1504,14 +1504,14 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_eq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1553,8 +1553,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_le( ; SI-LABEL: fcmp_v2f16_le: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1585,21 +1585,21 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; ; VI-LABEL: fcmp_v2f16_le: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; 
VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1608,14 +1608,14 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_le_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_le: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1657,8 +1657,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_gt( ; SI-LABEL: fcmp_v2f16_gt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; 
SI-NEXT: s_mov_b32 s14, s10 @@ -1689,21 +1689,21 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; ; VI-LABEL: fcmp_v2f16_gt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1712,14 +1712,14 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_gt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1762,8 +1762,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_lg( ; SI-LABEL: fcmp_v2f16_lg: ; SI: ; %bb.0: ; %entry -; 
SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1794,21 +1794,21 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; ; VI-LABEL: fcmp_v2f16_lg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1817,14 +1817,14 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_lg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 
s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1867,8 +1867,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_ge( ; SI-LABEL: fcmp_v2f16_ge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1899,21 +1899,21 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; ; VI-LABEL: fcmp_v2f16_ge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1922,14 +1922,14 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; 
GFX11-LABEL: fcmp_v2f16_ge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1972,8 +1972,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_o( ; SI-LABEL: fcmp_v2f16_o: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2004,21 +2004,21 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; ; VI-LABEL: fcmp_v2f16_o: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2027,14 +2027,14 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; VI-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_o: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2077,8 +2077,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_u( ; SI-LABEL: fcmp_v2f16_u: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2109,21 +2109,21 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; ; VI-LABEL: fcmp_v2f16_u: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, 
s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2132,14 +2132,14 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_u_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_u: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2181,8 +2181,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nge( ; SI-LABEL: fcmp_v2f16_nge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2213,21 +2213,21 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; ; VI-LABEL: fcmp_v2f16_nge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: 
buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2236,14 +2236,14 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2285,8 +2285,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nlg( ; SI-LABEL: fcmp_v2f16_nlg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2317,21 +2317,21 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; ; VI-LABEL: fcmp_v2f16_nlg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; 
VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2340,14 +2340,14 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nlg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2390,8 +2390,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_ngt( ; SI-LABEL: fcmp_v2f16_ngt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2422,21 +2422,21 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; ; VI-LABEL: fcmp_v2f16_ngt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; 
VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2445,14 +2445,14 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_ngt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2494,8 +2494,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nle( ; SI-LABEL: fcmp_v2f16_nle: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 
; SI-NEXT: s_mov_b32 s14, s10 @@ -2526,21 +2526,21 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; ; VI-LABEL: fcmp_v2f16_nle: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2549,14 +2549,14 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nle: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2598,8 +2598,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_neq( ; SI-LABEL: fcmp_v2f16_neq: ; SI: ; %bb.0: ; %entry 
-; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2630,21 +2630,21 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; ; VI-LABEL: fcmp_v2f16_neq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2653,14 +2653,14 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_neq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2702,8 +2702,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nlt( ; SI-LABEL: fcmp_v2f16_nlt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2734,21 +2734,21 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; ; VI-LABEL: fcmp_v2f16_nlt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2757,14 +2757,14 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: 
s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nlt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index bd483f4c070713..eda1709e4fd595 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -15,31 +15,30 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, half %sign) { ; SI-LABEL: s_copysign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_copysign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_movk_i32 s3, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s4, 16 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_bfi_b32 
v2, s2, v0, v1 +; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_bfi_b32 v2, s3, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -47,29 +46,29 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, ; ; GFX9-LABEL: s_copysign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -82,8 +81,8 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, define amdgpu_kernel void 
@s_test_copysign_f16_0(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -94,10 +93,10 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma ; ; VI-LABEL: s_test_copysign_f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -106,22 +105,22 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma ; ; GFX9-LABEL: s_test_copysign_f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -136,8 +135,8 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -148,10 +147,10 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma ; ; VI-LABEL: s_test_copysign_f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -160,22 +159,22 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma ; ; GFX9-LABEL: s_test_copysign_f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -190,8 +189,8 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_10.0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -202,10 +201,10 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_10.0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -214,22 +213,22 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_10.0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: 
s_and_b32 s0, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_10.0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -244,8 +243,8 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -256,10 +255,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x8000 +; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -268,22 +267,22 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, 
s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b32 s2, s4, 0x8000 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_or_b32 s0, s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -298,8 +297,8 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -310,10 +309,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x8000 +; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -322,22 +321,22 @@ 
define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_neg10: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b32 s2, s4, 0x8000 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_or_b32 s0, s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -352,26 +351,25 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; 
SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, s4, v0 +; VI-NEXT: v_and_b32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -379,23 +377,23 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_0_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_0_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -409,26 +407,25 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 
0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -437,24 +434,24 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_1_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
v_and_b32_e64 v0, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -469,27 +466,26 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: v_mov_b32_e32 v1, 0x41200000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -498,24 +494,24 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal ; ; GFX9-LABEL: s_test_copysign_f16_10_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -530,26 +526,25 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, -1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: 
v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -558,24 +553,24 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h ; ; GFX9-LABEL: s_test_copysign_f16_neg1_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -590,27 +585,26 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: v_mov_b32_e32 v1, 0xc1200000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 
-; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -619,24 +613,24 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_test_copysign_f16_neg10_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -829,8 +823,8 @@ define half 
@v_test_copysign_f16_neg10(half %mag) { define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -855,8 +849,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -880,17 +874,17 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] -; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] @@ -899,10 +893,8 
@@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -931,8 +923,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -957,8 +949,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -982,15 +974,15 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v1, s[6:7] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] -; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v2 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 @@ -1001,12 +993,9 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v2, v1, s[6:7] @@ -1035,8 +1024,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1060,8 +1049,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1085,14 +1074,14 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v1, s[0:1] ; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1104,10 +1093,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1136,8 +1123,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; SI: ; %bb.0: -; 
SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1163,8 +1150,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1188,14 +1175,14 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v1, s[0:1] ; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v2, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -1207,12 +1194,10 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v2, v1, s[4:5] ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -1239,8 +1224,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1266,8 +1251,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1291,14 +1276,14 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
global_load_dword v1, v1, s[0:1] ; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -1310,10 +1295,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1342,35 +1325,35 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: 
v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 @@ -1392,15 +1375,15 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s0, v2, v1 @@ -1410,12 +1393,10 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 
s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1442,8 +1423,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1471,8 +1452,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1496,17 +1477,17 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v1, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] -; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 ; GFX9-NEXT: global_store_short v2, v0, s[4:5] @@ -1515,10 +1496,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1547,8 +1526,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) %arg_out, double %mag, half %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: s_lshr_b32 s4, s3, 8 @@ -1611,8 +1590,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; ; VI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, 
s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s0, s7, 8 ; VI-NEXT: s_and_b32 s1, s7, 0x1ff @@ -1669,8 +1648,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s7, 8 @@ -1727,8 +1706,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s7, 0x1ff ; GFX11-NEXT: s_lshr_b32 s2, s7, 8 @@ -1798,7 +1777,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half> %arg_mag, <2 x half> %arg_sign) { ; SI-LABEL: s_copysign_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1822,7 +1801,7 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; VI-LABEL: s_copysign_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s4, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1842,7 +1821,7 @@ define amdgpu_kernel void 
@s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; GFX9-LABEL: s_copysign_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1861,7 +1840,7 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; GFX11-LABEL: s_copysign_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s3 @@ -1887,8 +1866,8 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) { ; SI-LABEL: s_copysign_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s2, s4, 16 @@ -1915,8 +1894,8 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half ; ; VI-LABEL: s_copysign_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -1944,33 +1923,33 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half ; ; GFX9-LABEL: s_copysign_v3f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_lshr_b32 s3, s6, 16 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v3 -; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 +; GFX9-NEXT: global_store_short v0, v2, s[2:3] offset:4 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s2, s6, 16 @@ -1999,8 +1978,8 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) { ; SI-LABEL: s_copysign_v4f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2034,8 +2013,8 @@ define 
amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; ; VI-LABEL: s_copysign_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 @@ -2065,39 +2044,39 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; ; GFX9-LABEL: s_copysign_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_lshr_b32 s3, s7, 16 +; GFX9-NEXT: s_lshr_b32 s1, s7, 16 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: s_lshr_b32 s3, s6, 16 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_bfi_b32 v3, s2, v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_bfi_b32 v3, s0, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; 
GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-NEXT: v_mov_b32_e32 v1, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index 542d67486e7580..f48961c905f58f 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag, float %sign) { ; SI-LABEL: s_test_copysign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_brev_b32 s8, -2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; ; VI-LABEL: s_test_copysign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -34,7 +34,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; ; GFX11-LABEL: s_test_copysign_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -51,8 +51,8 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: 
s_test_copysign_f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -63,10 +63,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m ; ; VI-LABEL: s_test_copysign_f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff +; VI-NEXT: s_bitset0_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -76,10 +76,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m ; GFX11-LABEL: s_test_copysign_f32_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff +; GFX11-NEXT: s_bitset0_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -94,8 +94,8 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -106,10 +106,10 @@ 
define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m ; ; VI-LABEL: s_test_copysign_f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff +; VI-NEXT: s_bitset0_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -119,10 +119,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m ; GFX11-LABEL: s_test_copysign_f32_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff +; GFX11-NEXT: s_bitset0_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -137,8 +137,8 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_10.0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -149,10 +149,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float ; ; VI-LABEL: s_test_copysign_f32_10.0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff +; VI-NEXT: s_bitset0_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -162,10 +162,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float ; GFX11-LABEL: s_test_copysign_f32_10.0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff +; GFX11-NEXT: s_bitset0_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -180,8 +180,8 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -192,10 +192,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float ; ; VI-LABEL: s_test_copysign_f32_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -205,10 +205,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float ; 
GFX11-LABEL: s_test_copysign_f32_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_bitset1_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -223,8 +223,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -235,10 +235,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -248,10 +248,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-NEXT: s_or_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_bitset1_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -266,8 +266,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -278,10 +278,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -291,10 +291,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_0_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -310,8 +310,8 @@ define amdgpu_kernel void 
@s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -323,10 +323,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -337,10 +337,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -356,8 +356,8 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -369,10 +369,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo ; ; VI-LABEL: s_test_copysign_f32_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -383,10 +383,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo ; GFX11-LABEL: s_test_copysign_f32_10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -402,8 +402,8 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -415,10 +415,10 @@ 
define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f ; ; VI-LABEL: s_test_copysign_f32_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -429,10 +429,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f ; GFX11-LABEL: s_test_copysign_f32_neg1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -448,8 +448,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -461,10 +461,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: 
s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -475,10 +475,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, ; GFX11-LABEL: s_test_copysign_f32_neg10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -494,8 +494,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) { ; SI-LABEL: s_test_copysign_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_brev_b32 s8, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -511,8 +511,8 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo ; ; VI-LABEL: s_test_copysign_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 @@ -529,8 +529,8 @@ define amdgpu_kernel 
void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo ; GFX11-LABEL: s_test_copysign_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-NEXT: v_mov_b32_e32 v2, s6 @@ -549,40 +549,40 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x float> %mag, <3 x float> %sign) { ; SI-LABEL: s_test_copysign_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_brev_b32 s7, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s7, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 +; SI-NEXT: v_bfi_b32 v0, s7, v0, v2 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_bfi_b32 v2, s0, v2, v3 -; SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: v_bfi_b32 v2, s7, v2, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 
0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s7, -2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_bfi_b32 v2, s7, v0, v1 +; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_bfi_b32 v1, s7, v3, v0 +; VI-NEXT: v_bfi_b32 v1, s2, v3, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v3, s8 -; VI-NEXT: v_bfi_b32 v0, s7, v0, v3 +; VI-NEXT: v_bfi_b32 v0, s2, v0, v3 ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] @@ -591,8 +591,8 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo ; GFX11-LABEL: s_test_copysign_v3f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s9 @@ -614,45 +614,45 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x float> %mag, <4 x float> %sign) { ; SI-LABEL: s_test_copysign_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_brev_b32 s12, -2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s12, v0, v1 ; 
SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_bfi_b32 v2, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v2, s12, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: v_bfi_b32 v0, s12, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_brev_b32 s12, -2 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s11 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_bfi_b32 v3, s12, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_bfi_b32 v2, s12, v2, v0 +; VI-NEXT: v_bfi_b32 v2, s2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_bfi_b32 v1, s12, v0, v1 +; VI-NEXT: v_bfi_b32 v1, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_bfi_b32 v0, s12, v0, v4 +; VI-NEXT: v_bfi_b32 v0, s2, v0, v4 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -661,8 +661,8 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo ; GFX11-LABEL: s_test_copysign_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s11 :: v_dual_mov_b32 v1, s10 @@ -906,46 +906,46 @@ define <5 x float> @v_test_copysign_v5f32(<5 x float> %mag, <5 x float> %sign) { define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out, float %mag, double %sign) { ; SI-LABEL: s_test_copysign_f32_fptrunc_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_fptrunc_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s0, -2 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_bfi_b32 v2, s0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64: ; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -958,7 +958,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -972,7 +972,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o ; ; VI-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s3, 0x80000000 @@ -984,7 +984,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o ; ; GFX11-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1003,7 +1003,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, float %mag, half %sign) { ; SI-LABEL: s_test_copysign_f32_fpext_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1018,7 +1018,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f32_fpext_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1050,24 +1050,23 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f32_1_fpext_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; SI-NEXT: v_or_b32_e32 v0, 1.0, v0 -; SI-NEXT: 
s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_1_fpext_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 16 +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1079,10 +1078,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out ; GFX11-LABEL: s_test_copysign_f32_1_fpext_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s2, s4, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 @@ -1101,7 +1100,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, float %mag, bfloat %sign) { ; SI-LABEL: s_test_copysign_f32_fpext_bf16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1117,7 +1116,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_bf16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 
v0, 16, s3 @@ -1130,7 +1129,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f32_fpext_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index 4300faa02742a3..b5fa3fd9eccc13 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -11,49 +11,49 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) #0 define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], double %sign) { ; SI-LABEL: s_test_copysign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x1d +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x1d ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s6, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_bfi_b32 v1, s6, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; 
VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_brev_b32 s4, -2 -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_bfi_b32 v1, s4, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x74 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -65,8 +65,8 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -78,28 +78,28 @@ define 
amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32 ; ; VI-LABEL: s_test_copysign_f64_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bitset0_b32 s3, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s1, 31 +; GFX11-NEXT: s_bitset0_b32 s3, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -111,8 +111,8 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -124,28 +124,28 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32 ; ; VI-LABEL: s_test_copysign_f64_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bitset0_b32 s3, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s1, 31 +; GFX11-NEXT: s_bitset0_b32 s3, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -157,8 +157,8 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; 
SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -170,28 +170,28 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3 ; ; VI-LABEL: s_test_copysign_f64_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bitset0_b32 s3, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s1, 31 +; GFX11-NEXT: s_bitset0_b32 s3, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -203,8 +203,8 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) 
%out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -216,28 +216,28 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: s_test_copysign_f64_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bitset1_b32 s3, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s1, 31 +; GFX11-NEXT: s_bitset1_b32 s3, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -249,8 +249,8 @@ define 
amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -262,28 +262,28 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: s_test_copysign_f64_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bitset1_b32 s3, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s1, 31 +; GFX11-NEXT: s_bitset1_b32 s3, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 
v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -295,49 +295,49 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], float %sign) { ; SI-LABEL: s_test_copysign_f64_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dword s6, s[2:3], 0x1d -; SI-NEXT: s_brev_b32 s7, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dword s0, s[0:1], 0x1d +; SI-NEXT: s_brev_b32 s1, -2 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_bfi_b32 v1, s7, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_bfi_b32 v1, s1, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dword s4, s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dword s4, s[0:1], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s5, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_bfi_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: 
v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x74 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -350,49 +350,49 @@ define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], half %sign) { ; SI-LABEL: s_test_copysign_f64_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[2:3], 0x1d -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s2, s[0:1], 0x1d +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_brev_b32 s6, -2 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_bfi_b32 v1, s6, v1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; 
SI-NEXT: v_bfi_b32 v1, s2, v1, v0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x74 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s5, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_bfi_b32 v1, s5, v1, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x74 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -405,7 +405,7 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) 
%out, double %sign) { ; SI-LABEL: s_test_copysign_f64_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -419,7 +419,7 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub ; ; VI-LABEL: s_test_copysign_f64_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -431,7 +431,7 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub ; ; GFX11-LABEL: s_test_copysign_f64_0_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -448,7 +448,7 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -463,7 +463,7 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub ; ; VI-LABEL: s_test_copysign_f64_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -476,7 +476,7 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub ; ; GFX11-LABEL: s_test_copysign_f64_1_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -494,7 +494,7 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -509,7 +509,7 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou ; ; VI-LABEL: s_test_copysign_f64_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -522,7 +522,7 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou ; ; GFX11-LABEL: s_test_copysign_f64_10_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -540,7 +540,7 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -555,7 +555,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d ; ; VI-LABEL: s_test_copysign_f64_neg1_mag: ; VI: 
; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -568,7 +568,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d ; ; GFX11-LABEL: s_test_copysign_f64_neg1_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -586,7 +586,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -601,7 +601,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f64_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -614,7 +614,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f64_neg10_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -632,36 +632,36 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, define amdgpu_kernel void 
@s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x double> %mag, <2 x double> %sign) { ; SI-LABEL: s_test_copysign_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_brev_b32 s8, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s8, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s8, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s8, -2 ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s11 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s8, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_bfi_b32 v1, s8, v2, v0 +; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -671,8 +671,8 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou ; GFX11-LABEL: s_test_copysign_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s11 ; GFX11-NEXT: v_mov_b32_e32 v2, s9 @@ -693,46 +693,46 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %mag, <3 x double> %sign) { ; SI-LABEL: s_test_copysign_v3f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s23, 0xf000 -; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_brev_b32 s10, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s10, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s10, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_bfi_b32 v5, s0, v0, v2 +; SI-NEXT: v_bfi_b32 v5, s10, v0, v2 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[20:23], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v3f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: 
s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s10, -2 ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s15 -; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s10, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_bfi_b32 v1, s10, v2, v0 +; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v2, s17 +; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: v_bfi_b32 v5, s10, v0, v2 ; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] @@ -746,8 +746,8 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou ; GFX11-LABEL: s_test_copysign_v3f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s15 ; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v0, s4 @@ -771,53 +771,54 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) { ; SI-LABEL: s_test_copysign_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x11 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s23, 0xf000 -; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_brev_b32 s12, -2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: 
v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s12, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v2, s19 -; SI-NEXT: v_bfi_b32 v7, s0, v0, v2 +; SI-NEXT: v_bfi_b32 v7, s12, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_bfi_b32 v5, s0, v0, v2 +; SI-NEXT: v_bfi_b32 v5, s12, v0, v2 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s12, -2 ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s15 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s12, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: v_bfi_b32 v1, s12, v2, v0 +; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v2, s19 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_bfi_b32 v7, s12, v0, v2 +; VI-NEXT: v_bfi_b32 v7, s2, v0, v2 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v2, s17 +; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: 
v_mov_b32_e32 v9, s3 -; VI-NEXT: v_bfi_b32 v5, s12, v0, v2 ; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v6, s10 ; VI-NEXT: v_mov_b32_e32 v8, s2 @@ -832,8 +833,8 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou ; GFX11-LABEL: s_test_copysign_v4f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s15 ; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index f53d3cf33c9cc8..b14b6421f56b4e 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -10,20 +10,20 @@ define amdgpu_kernel void @v_fdiv_f16( ; SI-LABEL: v_fdiv_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, v2 @@ -46,8 +46,8 @@ define amdgpu_kernel void @v_fdiv_f16( ; ; GFX8-LABEL: v_fdiv_f16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -74,13 +74,13 @@ define amdgpu_kernel void @v_fdiv_f16( ; ; GFX9-LABEL: v_fdiv_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 @@ -92,13 +92,13 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX10-LABEL: v_fdiv_f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 @@ -110,10 +110,8 @@ define amdgpu_kernel void 
@v_fdiv_f16( ; GFX11-LABEL: v_fdiv_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc @@ -149,7 +147,7 @@ entry: define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -180,7 +178,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX8-LABEL: v_rcp_f16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -197,7 +195,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX9-LABEL: v_rcp_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -208,7 +206,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX10-LABEL: v_rcp_f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -219,9 
+217,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX11-LABEL: v_rcp_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -245,7 +241,7 @@ entry: define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16_abs: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -276,7 +272,7 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rcp_f16_abs: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -293,7 +289,7 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_abs: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -304,7 +300,7 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX10-LABEL: v_rcp_f16_abs: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -315,9 +311,7 @@ define 
amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX11-LABEL: v_rcp_f16_abs: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -344,7 +338,7 @@ entry: define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: reciprocal_f16_rounded: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -375,7 +369,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX8-LABEL: reciprocal_f16_rounded: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -392,7 +386,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX9-LABEL: reciprocal_f16_rounded: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -403,7 +397,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX10-LABEL: reciprocal_f16_rounded: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc 
@@ -414,9 +408,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX11-LABEL: reciprocal_f16_rounded: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -440,7 +432,7 @@ entry: define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16_afn: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -458,7 +450,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rcp_f16_afn: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -475,7 +467,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_afn: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -486,7 +478,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX10-LABEL: v_rcp_f16_afn: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -497,9 
+489,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX11-LABEL: v_rcp_f16_afn: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -523,7 +513,7 @@ entry: define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16_neg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -554,7 +544,7 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rcp_f16_neg: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -571,7 +561,7 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_neg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -582,7 +572,7 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX10-LABEL: v_rcp_f16_neg: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -593,9 +583,7 @@ define 
amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX11-LABEL: v_rcp_f16_neg: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -619,7 +607,7 @@ entry: define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -653,7 +641,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX8-LABEL: v_rsq_f16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -670,7 +658,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX9-LABEL: v_rsq_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -681,7 +669,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX10-LABEL: v_rsq_f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -692,9 +680,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr 
addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX11-LABEL: v_rsq_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -719,7 +705,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_neg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -753,7 +739,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rsq_f16_neg: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -771,7 +757,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rsq_f16_neg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -783,7 +769,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX10-LABEL: v_rsq_f16_neg: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -795,9 +781,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, 
ptr addrspace(1) % ; ; GFX11-LABEL: v_rsq_f16_neg: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -824,7 +808,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_multi_use: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -860,7 +844,7 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac ; ; GFX8-LABEL: v_rsq_f16_multi_use: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -879,7 +863,7 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac ; ; GFX9-LABEL: v_rsq_f16_multi_use: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -892,7 +876,7 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac ; ; GFX10-LABEL: v_rsq_f16_multi_use: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -905,9 +889,7 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr 
addrspace(1) %r, ptr addrspac ; ; GFX11-LABEL: v_rsq_f16_multi_use: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -935,7 +917,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_missing_contract0: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -969,7 +951,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX8-LABEL: v_rsq_f16_missing_contract0: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -987,7 +969,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX9-LABEL: v_rsq_f16_missing_contract0: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -999,7 +981,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX10-LABEL: v_rsq_f16_missing_contract0: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1011,9 +993,7 
@@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX11-LABEL: v_rsq_f16_missing_contract0: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1040,7 +1020,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_missing_contract1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1074,7 +1054,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX8-LABEL: v_rsq_f16_missing_contract1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1092,7 +1072,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX9-LABEL: v_rsq_f16_missing_contract1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -1104,7 +1084,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX10-LABEL: v_rsq_f16_missing_contract1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1116,9 +1096,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX11-LABEL: v_rsq_f16_missing_contract1: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1145,7 +1123,7 @@ entry: define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_neg_rsq_f16_missing_contract1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1179,7 +1157,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX8-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1197,7 +1175,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX9-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -1209,7 +1187,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX10-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1221,9 +1199,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX11-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1250,20 +1226,20 @@ entry: define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_fdiv_f16_afn: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_rcp_f32_e32 v3, v3 @@ -1274,8 +1250,8 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) 
; ; GFX8-LABEL: v_fdiv_f16_afn: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1298,13 +1274,13 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) ; ; GFX9-LABEL: v_fdiv_f16_afn: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v2, v2 ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 @@ -1314,13 +1290,13 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) ; GFX10-LABEL: v_fdiv_f16_afn: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v2, v2 ; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 @@ -1330,10 +1306,8 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) ; GFX11-LABEL: 
v_fdiv_f16_afn: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc @@ -1363,20 +1337,20 @@ entry: define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #2 { ; SI-LABEL: v_fdiv_f16_unsafe: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_rcp_f32_e32 v3, v3 @@ -1387,8 +1361,8 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace( ; ; GFX8-LABEL: v_fdiv_f16_unsafe: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1411,13 +1385,13 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace( ; ; GFX9-LABEL: v_fdiv_f16_unsafe: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v2, v2 ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 @@ -1427,13 +1401,13 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace( ; GFX10-LABEL: v_fdiv_f16_unsafe: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v2, v2 ; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 @@ -1443,10 +1417,8 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace( ; GFX11-LABEL: v_fdiv_f16_unsafe: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc @@ -1476,7 +1448,7 @@ entry: define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; SI-LABEL: div_afn_2_x_pat_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1491,7 +1463,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8-LABEL: div_afn_2_x_pat_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0.5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1503,7 +1475,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX9-LABEL: div_afn_2_x_pat_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0.5, v0 @@ -1514,7 +1486,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX10-LABEL: div_afn_2_x_pat_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v0, 0.5, v0 @@ -1525,7 +1497,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; 
GFX11-LABEL: div_afn_2_x_pat_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0 @@ -1543,7 +1515,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; SI-LABEL: div_afn_k_x_pat_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1558,7 +1530,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8-LABEL: div_afn_k_x_pat_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0x2e66, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1570,7 +1542,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX9-LABEL: div_afn_k_x_pat_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 @@ -1581,7 +1553,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX10-LABEL: div_afn_k_x_pat_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 @@ -1592,7 +1564,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr 
addrspace(1) %out) #0 { ; GFX11-LABEL: div_afn_k_x_pat_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 @@ -1610,7 +1582,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; SI-LABEL: div_afn_neg_k_x_pat_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1625,7 +1597,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8-LABEL: div_afn_neg_k_x_pat_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0xae66, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1637,7 +1609,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX9-LABEL: div_afn_neg_k_x_pat_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0xae66, v0 @@ -1648,7 +1620,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX10-LABEL: div_afn_neg_k_x_pat_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v0, 0xae66, v0 @@ -1659,7 +1631,7 
@@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX11-LABEL: div_afn_neg_k_x_pat_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e32 v0, 0xae66, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index c6b730e3fd5d6f..0468175c5df50d 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_ninf: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -42,7 +42,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_ninf: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -68,7 +68,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX7-LABEL: s_fdiv_f32_ninf: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -94,7 +94,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -118,7 +118,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX11-LABEL: s_fdiv_f32_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -181,7 +181,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_ieee: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -205,7 +205,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_ieee: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -229,7 +229,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX7-LABEL: s_fdiv_f32_ieee: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -253,7 +253,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ieee: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ieee: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 @@ -294,7 +294,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX11-LABEL: s_fdiv_f32_ieee: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 @@ -334,7 +334,7 @@ entry: define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_25ulp_f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX67-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GFX67-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 @@ -353,7 +353,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX8-LABEL: s_fdiv_25ulp_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; 
GFX8-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -370,7 +370,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX10-LABEL: s_fdiv_25ulp_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| @@ -384,7 +384,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX11-LABEL: s_fdiv_25ulp_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| @@ -420,7 +420,7 @@ entry: define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX6-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -446,7 +446,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX7-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -465,7 +465,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX8-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_frexp_mant_f32_e32 v1, s3 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1 @@ -482,7 +482,7 @@ define 
amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s3 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 @@ -498,7 +498,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_frexp_mant_f32_e32 v0, s3 ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 @@ -535,7 +535,7 @@ entry: define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX67-LABEL: s_fdiv_fast_ieee_f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -548,7 +548,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_fast_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -559,7 +559,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_fast_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s3 @@ -569,7 +569,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; 
GFX11-LABEL: s_fdiv_fast_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -599,7 +599,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -612,7 +612,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -623,7 +623,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s3 @@ -633,7 +633,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -663,7 +663,7 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -676,7 +676,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX8-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -687,7 +687,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s3 @@ -697,7 +697,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -727,7 +727,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_arcp_daz: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -753,7 +753,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_arcp_daz: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -779,7 +779,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX7-LABEL: s_fdiv_f32_arcp_daz: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -805,7 +805,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_daz: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -829,7 +829,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_daz: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -850,7 +850,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_f32_arcp_daz: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -892,7 +892,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_f32_arcp_ninf: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -905,7 +905,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -916,7 +916,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s3 @@ -926,7 +926,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_f32_arcp_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -956,8 +956,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v2f32: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -996,10 +996,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; 
GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s7 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s4 @@ -1012,11 +1013,10 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[0:1], s6, s6, v4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[2:3], s6, s6, v4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2 @@ -1031,14 +1031,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v5 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s6, v4 -; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_v2f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1077,10 +1076,11 @@ define amdgpu_kernel void 
@s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX8-LABEL: s_fdiv_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 +; GFX8-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 @@ -1093,11 +1093,10 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s6, s6, v4 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s6, s6, v4 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v5, v2 ; GFX8-NEXT: v_div_fixup_f32 v1, v1, s7, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1109,7 +1108,6 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_div_fixup_f32 v0, v0, s6, v4 @@ -1118,10 +1116,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX10-LABEL: s_fdiv_v2f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s5 +; GFX10-NEXT: v_div_scale_f32 v0, s2, s7, 
s7, s5 ; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -1131,9 +1130,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_scale_f32 v2, s0, s6, s6, s4 +; GFX10-NEXT: v_div_scale_f32 v2, s2, s6, s6, s4 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s5 ; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s4, s6, s4 @@ -1155,8 +1153,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX11-LABEL: s_fdiv_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s5 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 @@ -1214,8 +1212,8 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_v2f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -1228,8 +1226,8 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> ; ; GFX8-LABEL: s_fdiv_ulp25_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; 
GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s6 ; GFX8-NEXT: v_rcp_f32_e32 v1, s7 @@ -1243,22 +1241,22 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> ; GFX10-LABEL: s_fdiv_ulp25_v2f32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, s7 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s6 ; GFX11-NEXT: v_rcp_f32_e32 v1, s7 @@ -1292,8 +1290,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -1306,8 +1304,8 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl ; ; 
GFX8-LABEL: s_fdiv_v2f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s7 ; GFX8-NEXT: v_rcp_f32_e32 v2, s6 @@ -1321,22 +1319,22 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl ; GFX10-LABEL: s_fdiv_v2f32_fast_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 ; GFX10-NEXT: v_rcp_f32_e32 v2, s6 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_fast_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: v_rcp_f32_e32 v2, s6 @@ -1370,8 +1368,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_arcp_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: 
s_waitcnt lgkmcnt(0) @@ -1384,8 +1382,8 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl ; ; GFX8-LABEL: s_fdiv_v2f32_arcp_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s7 ; GFX8-NEXT: v_rcp_f32_e32 v2, s6 @@ -1399,22 +1397,22 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl ; GFX10-LABEL: s_fdiv_v2f32_arcp_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 ; GFX10-NEXT: v_rcp_f32_e32 v2, s6 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_arcp_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: v_rcp_f32_e32 v2, s6 @@ -1448,7 +1446,7 @@ entry: define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v4f32: ; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-FASTFMA-NEXT: 
s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-FASTFMA-NEXT: s_mov_b32 s11, 0xf000 @@ -1519,7 +1517,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v4f32: ; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -1590,7 +1588,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX7-LABEL: s_fdiv_v4f32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 @@ -1661,7 +1659,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: s_fdiv_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1732,7 +1730,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: s_fdiv_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1794,7 +1792,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: s_fdiv_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1895,7 
+1893,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX67-LABEL: s_fdiv_v4f32_fast_math: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX67-NEXT: s_mov_b32 s11, 0xf000 @@ -1914,7 +1912,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: s_fdiv_v4f32_fast_math: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -1933,7 +1931,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: s_fdiv_v4f32_fast_math: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1951,7 +1949,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: s_fdiv_v4f32_fast_math: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2003,7 +2001,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX67-LABEL: s_fdiv_v4f32_arcp_math: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX67-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX67-NEXT: s_mov_b32 s11, 0xf000 @@ -2022,7 +2020,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: s_fdiv_v4f32_arcp_math: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -2041,7 +2039,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: s_fdiv_v4f32_arcp_math: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -2059,7 +2057,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: s_fdiv_v4f32_arcp_math: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2111,8 +2109,8 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspace(1) %out, float %a) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -2134,11 +2132,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr 
addrspac ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v2, v0 @@ -2152,14 +2150,13 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2181,11 +2178,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX8-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 +; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 ; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX8-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v2, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0 @@ -2197,7 +2194,6 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2205,11 +2201,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX10-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s4, 1.0 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -2221,7 +2217,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2229,11 +2225,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX11-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -2246,7 +2242,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2271,8 +2267,8 @@ entry: define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr addrspace(1) %out, float %a) #1 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -2292,11 +2288,11 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[2:3], s4, 
s4, 1.0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v2, v0 @@ -2308,14 +2304,13 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2335,11 +2330,11 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX8-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 +; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 ; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v2, v0 ; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0 ; GFX8-NEXT: v_fma_f32 v2, v3, v2, v2 @@ -2349,7 +2344,6 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2357,21 +2351,21 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX10-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s4, 1.0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2379,22 +2373,22 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX11-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_div_scale_f32 v2, 
vcc_lo, 1.0, s4, 1.0 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX11-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll index 8e43bd890a8fa4..c56b4ae3c34f5d 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -37,7 +37,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -52,7 +52,7 @@ define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -69,7 +69,7 @@ define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -102,7 +102,7 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -117,7 +117,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -135,7 +135,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -165,7 +165,7 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -185,7 +185,7 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_minus_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -215,7 +215,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_minus_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -235,7 
+235,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -265,7 +265,7 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -286,7 +286,7 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_minus_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -316,7 +316,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg ; ; GCN-FLUSH-LABEL: div_v4_minus_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -337,7 +337,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { ; 
GCN-DENORM-LABEL: div_v4_c_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -369,7 +369,7 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_c_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 @@ -401,7 +401,7 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_c_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -433,7 +433,7 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_c_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0x2f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 @@ -468,40 +468,40 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) { ; GCN-DENORM-LABEL: div_v_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x24 +; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0 ; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s4 ; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v3, s4 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v1, s2 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v1, s0 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v4, s2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v4, s0 ; GCN-DENORM-NEXT: v_sub_u32_e32 v2, v2, v4 ; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v3, v1 ; GCN-DENORM-NEXT: v_ldexp_f32 v1, v1, v2 -; GCN-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-DENORM-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0 ; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s2, v0 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s0, v0 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s4, v1 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-FLUSH-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-FLUSH-NEXT: global_store_dword v2, v0, s[2:3] ; GCN-FLUSH-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 
%div = fdiv float %num, %load, !fpmath !0 @@ -512,7 +512,7 @@ define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) { define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) { ; GCN-LABEL: div_1_by_x_fast: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -529,7 +529,7 @@ define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_x_fast: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -540,7 +540,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_x_fast: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -558,7 +558,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) { ; GCN-LABEL: div_1_by_minus_x_fast: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -576,7 +576,7 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_minus_x_fast: ; GCN-DENORM: ; %bb.0: -; 
GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -587,7 +587,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_fast: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -606,7 +606,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -627,7 +627,7 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_1_by_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) @@ -656,7 +656,7 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, 
s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -677,7 +677,7 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) % ; ; GCN-FLUSH-LABEL: div_minus_1_by_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) @@ -706,7 +706,7 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) % define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_minus_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -727,7 +727,7 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) % ; ; GCN-FLUSH-LABEL: div_1_by_minus_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) @@ -757,7 +757,7 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) % define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_minus_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -778,7 +778,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr 
addrspac ; ; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index 431b7d5400f430..b9583a73295e26 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -7,10 +7,10 @@ declare void @extern_func() #0 define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { ; FLAT_SCR_OPT-LABEL: stack_object_addrspacecast_in_kernel_no_calls: ; FLAT_SCR_OPT: ; %bb.0: -; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11 -; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; FLAT_SCR_OPT-NEXT: s_add_u32 s0, s0, s3 +; FLAT_SCR_OPT-NEXT: s_addc_u32 s1, s1, 0 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; FLAT_SCR_OPT-NEXT: s_mov_b64 s[0:1], src_private_base ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, s1 @@ -37,10 +37,10 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { define amdgpu_kernel void @stack_object_in_kernel_no_calls() { ; FLAT_SCR_OPT-LABEL: stack_object_in_kernel_no_calls: ; FLAT_SCR_OPT: ; %bb.0: -; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11 -; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; FLAT_SCR_OPT-NEXT: s_add_u32 s0, s0, s3 +; FLAT_SCR_OPT-NEXT: s_addc_u32 s1, s1, 0 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; 
FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0 ; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s0 @@ -110,22 +110,22 @@ define amdgpu_kernel void @kernel_calls_no_stack() { define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_OPT-LABEL: test: ; FLAT_SCR_OPT: ; %bb.0: -; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11 -; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; FLAT_SCR_OPT-NEXT: s_add_u32 s2, s2, s5 +; FLAT_SCR_OPT-NEXT: s_addc_u32 s3, s3, 0 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; FLAT_SCR_OPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s0, 0 -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s1, 1 +; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s2, 0 +; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s3, 1 ; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill +; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 0 +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 ; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 -; FLAT_SCR_OPT-NEXT: s_load_dword vcc_lo, s[2:3], 0x8 -; FLAT_SCR_OPT-NEXT: ; kill: killed $sgpr2_sgpr3 +; FLAT_SCR_OPT-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 +; FLAT_SCR_OPT-NEXT: ; kill: killed $sgpr0_sgpr1 ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART ; FLAT_SCR_OPT-NEXT: ;;#ASMEND ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART @@ -237,18 +237,18 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { 
; ; FLAT_SCR_ARCH-LABEL: test: ; FLAT_SCR_ARCH: ; %bb.0: -; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; FLAT_SCR_ARCH-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s0, 0 -; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s1, 1 +; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s2, 0 +; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s3, 1 ; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 0 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 0 +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 ; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 -; FLAT_SCR_ARCH-NEXT: s_load_dword vcc_lo, s[2:3], 0x8 -; FLAT_SCR_ARCH-NEXT: ; kill: killed $sgpr2_sgpr3 +; FLAT_SCR_ARCH-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 +; FLAT_SCR_ARCH-NEXT: ; kill: killed $sgpr0_sgpr1 ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 087d38ce7b0046..0af57c6a97db5c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -14,19 +14,18 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 
1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -36,18 +35,17 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff1_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 @@ -57,13 +55,12 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff1_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; 
GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc @@ -76,13 +73,12 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff1_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc @@ -95,9 +91,9 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff1_voff1: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: 
scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS @@ -110,11 +106,10 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff1_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -142,19 +137,18 @@ bb: define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -164,8 +158,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff1_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb -; 
GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -186,15 +179,14 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff1_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -206,15 +198,14 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff1_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; 
GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -226,11 +217,9 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff1_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS @@ -243,12 +232,11 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff1_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; 
GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -276,19 +264,18 @@ bb: define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -298,8 +285,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff1_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -320,15 +306,14 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff1_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -340,15 +325,14 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff1_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -360,11 +344,9 @@ define 
amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff1_voff4: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS @@ -377,12 +359,11 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff1_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -410,20 +391,19 @@ bb: define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX940-SDAG-NEXT: v_add_u32_e32 
v1, s0, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -433,19 +413,18 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff2_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 @@ -455,15 +434,13 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff2_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: 
v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc @@ -476,15 +453,13 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff2_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc @@ -497,9 +472,9 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff2_voff1: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; 
GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -514,12 +489,12 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff2_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -547,9 +522,8 @@ bb: define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -569,8 +543,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff2_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -592,17 +565,16 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff2_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 4, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc @@ -613,17 +585,16 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff2_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -635,13 +606,12 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff2_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -653,14 +623,12 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff2_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; 
GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -688,9 +656,8 @@ bb: define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -710,8 +677,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff2_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -733,17 +699,16 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff2_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 4, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc @@ -754,17 +719,16 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff2_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -776,13 +740,12 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff2_voff4: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: 
v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -794,14 +757,12 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff2_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -829,20 +790,19 @@ bb: define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; 
GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -852,19 +812,18 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff4_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 @@ -874,15 +833,13 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff4_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb -; 
GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc @@ -895,15 +852,13 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff4_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc @@ -916,9 +871,9 @@ define 
amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff4_voff1: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -933,12 +888,12 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff4_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -966,9 +921,8 @@ bb: define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 @@ -988,8 +942,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff4_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, 
s[2:3], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -1011,17 +964,16 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff4_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 4, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc @@ -1032,17 +984,16 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff4_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; 
GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1054,13 +1005,12 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff4_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -1072,14 +1022,12 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff4_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 
v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -1107,18 +1055,17 @@ bb: define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, 2 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 ; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v3, off offset:2 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -1128,8 +1075,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff4_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -1151,14 +1097,12 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff4_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v3, 4, v0 @@ -1172,17 +1116,16 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff4_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; 
GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1194,13 +1137,12 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff4_voff4: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -1212,14 +1154,12 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff4_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 
v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 14d8b71c5167a2..850be72f06c7d0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -13,13 +13,13 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-LABEL: zero_init_kernel: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_mov_b32 s2, s0 ; GFX9-NEXT: s_mov_b32 s3, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -31,10 +31,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX10-LABEL: zero_init_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 @@ -83,18 +83,18 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX9-PAL-LABEL: zero_init_kernel: ; GFX9-PAL: ; %bb.0: -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 
flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0 -; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 @@ -120,15 +120,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX1010-PAL-LABEL: zero_init_kernel: ; GFX1010-PAL: ; %bb.0: -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 @@ -145,15 +145,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX1030-PAL-LABEL: zero_init_kernel: ; GFX1030-PAL: ; %bb.0: -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 
s[2:3], s[2:3], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 @@ -374,9 +374,9 @@ define void @zero_init_foo() { define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 @@ -392,11 +392,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 @@ -412,7 +412,7 
@@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 @@ -428,7 +428,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s1, s0, 15 @@ -444,15 +444,15 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX9-PAL-LABEL: store_load_sindex_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX9-PAL-NEXT: s_mov_b32 s4, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: s_add_i32 s1, s1, 0 @@ -466,7 +466,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: 
s_lshl_b32 s1, s0, 2 @@ -482,16 +482,16 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX10-PAL-LABEL: store_load_sindex_kernel: ; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX10-PAL-NEXT: s_mov_b32 s10, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX10-PAL-NEXT: s_mov_b32 s4, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX10-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX10-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 @@ -507,7 +507,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 @@ -523,7 +523,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_kernel: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 @@ -707,9 +707,9 @@ bb: define amdgpu_kernel void 
@store_load_vindex_kernel() { ; GFX9-LABEL: store_load_vindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_add_u32_e32 v1, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: scratch_store_dword v1, v2, off @@ -721,10 +721,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX10-LABEL: store_load_vindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0, v0 @@ -738,8 +738,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -750,8 +749,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-LABEL: store_load_vindex_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: 
v_sub_nc_u32_e32 v2, 0, v0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -761,17 +759,17 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX9-PAL-LABEL: store_load_vindex_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc @@ -781,7 +779,6 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX940-LABEL: store_load_vindex_kernel: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -792,15 +789,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX10-PAL-LABEL: store_load_vindex_kernel: ; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX10-PAL-NEXT: s_mov_b32 s10, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX10-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 0, v0 @@ -814,8 +811,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-PAL-LABEL: store_load_vindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -826,8 +822,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-PAL-LABEL: store_load_vindex_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 @@ -1068,8 +1063,8 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-LABEL: zero_init_small_offset_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; 
GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1088,10 +1083,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX10-LABEL: zero_init_small_offset_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 @@ -1146,19 +1141,19 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX9-PAL-LABEL: zero_init_small_offset_kernel: ; GFX9-PAL: ; %bb.0: -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_mov_b32 s3, s0 ; GFX9-PAL-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -1187,15 +1182,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1214,15 +1209,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: ; GFX1030-PAL: ; %bb.0: -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: 
s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 @@ -1473,9 +1468,9 @@ define void @zero_init_small_offset_foo() { define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1493,11 +1488,11 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -1515,7 +1510,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: 
s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 @@ -1533,7 +1528,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 @@ -1551,17 +1546,17 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 -; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX9-PAL-NEXT: s_mov_b32 s4, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-PAL-NEXT: s_mov_b32 s1, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 glc -; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1576,7 +1571,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, 
s[0:1], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -1594,16 +1589,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX1010-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1622,16 +1617,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, 
s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX1030-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1649,7 +1644,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1667,7 +1662,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1905,8 +1900,8 @@ bb: define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-LABEL: store_load_vindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1922,10 
+1917,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: scratch_load_dword v3, off, off glc dlc @@ -1943,8 +1938,6 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1957,8 +1950,6 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -1968,16 +1959,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; 
GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x100, v0 @@ -1993,7 +1984,6 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: scratch_store_dword v0, v1, off offset:256 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2004,15 +1994,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 @@ -2028,15 +2018,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off glc dlc @@ -2054,8 +2044,6 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2068,8 +2056,6 @@ define amdgpu_kernel void 
@store_load_vindex_small_offset_kernel() { ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 @@ -2260,8 +2246,8 @@ bb: define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX9-LABEL: zero_init_large_offset_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2281,10 +2267,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX10-LABEL: zero_init_large_offset_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 @@ -2341,19 +2327,19 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX9-PAL-LABEL: zero_init_large_offset_kernel: ; GFX9-PAL: ; %bb.0: -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] ; 
GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_mov_b32 s3, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -2384,15 +2370,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2412,15 +2398,15 @@ define amdgpu_kernel 
void @zero_init_large_offset_kernel() { ; ; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: ; GFX1030-PAL: ; %bb.0: -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 @@ -2725,9 +2711,9 @@ define void @zero_init_large_offset_foo() { define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2745,11 +2731,11 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -2767,7 +2753,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 @@ -2785,7 +2771,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 @@ -2803,17 +2789,17 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 -; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX9-PAL-NEXT: s_mov_b32 s4, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-PAL-NEXT: s_mov_b32 s1, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc -; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -2828,7 +2814,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_large_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -2846,16 +2832,16 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX1010-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), 
s5 +; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2874,16 +2860,16 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX1030-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -2901,7 +2887,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -2919,7 +2905,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; 
; GFX12-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -3157,8 +3143,8 @@ bb: define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-LABEL: store_load_vindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3174,10 +3160,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc @@ -3196,8 +3182,6 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX11-NEXT: s_movk_i32 s0, 0x4004 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3210,8 +3194,6 @@ define amdgpu_kernel void 
@store_load_vindex_large_offset_kernel() { ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -3221,16 +3203,16 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0 @@ -3246,7 +3228,6 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: s_movk_i32 s0, 0x4004 ; GFX940-NEXT: scratch_store_dword v0, v1, s0 sc0 sc1 @@ -3258,15 +3239,15 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX1010-PAL-LABEL: 
store_load_vindex_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 @@ -3282,15 +3263,15 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; 
GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc @@ -3309,8 +3290,6 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3323,8 +3302,6 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 @@ -3518,8 +3495,8 @@ bb: define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-LABEL: store_load_large_imm_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 @@ -3535,10 +3512,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX10-LABEL: store_load_large_imm_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; 
GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 s0, 0x3800 @@ -3576,15 +3553,15 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 @@ -3611,15 +3588,15 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800 @@ -3635,15 +3612,15 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800 @@ -3841,10 +3818,10 @@ bb: define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-LABEL: store_load_vidx_sidx_offset: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 
0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 @@ -3857,11 +3834,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX10-LABEL: store_load_vidx_sidx_offset: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -3874,11 +3851,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, 0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3888,10 +3864,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX12-LABEL: store_load_vidx_sidx_offset: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b32 s0, 
s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -3901,16 +3876,16 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX9-PAL-NEXT: s_mov_b32 s4, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 +; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 @@ -3921,8 +3896,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX940-LABEL: store_load_vidx_sidx_offset: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 @@ -3936,16 +3910,16 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX10-PAL-NEXT: 
s_mov_b32 s10, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX10-PAL-NEXT: s_mov_b32 s4, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX10-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX10-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -3958,11 +3932,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3972,10 +3945,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX12-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-PAL-NEXT: 
v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 -; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index c9618d43943ef2..e44572985e6d2e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -21,14 +21,14 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 
v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -36,11 +36,11 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -55,14 +55,14 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -70,14 +70,14 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, 
s0, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -85,11 +85,11 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -104,14 +104,14 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -119,14 +119,14 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; 
GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -134,11 +134,11 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -155,8 +155,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_add_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -173,8 +173,8 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_add_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword 
s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -191,12 +191,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_add_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -214,18 +214,18 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -233,18 +233,18 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: 
atomic_add_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -252,11 +252,11 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_add_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -276,18 +276,18 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: 
s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -298,18 +298,18 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -320,11 +320,11 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 
; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -348,12 +348,12 @@ entry: define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -361,12 +361,12 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -374,11 +374,11 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 
v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -392,8 +392,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_add_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -408,8 +408,8 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -424,12 +424,12 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -446,16 +446,16 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd 
-; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -463,16 +463,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_add_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -480,11 +480,11 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_add_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: 
s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -503,16 +503,16 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -523,16 +523,16 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_add_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_add v2, 
v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -543,11 +543,11 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_add_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -570,14 +570,14 @@ entry: define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_and_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -585,14 +585,14 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; 
GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -600,11 +600,11 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -619,8 +619,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_and_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -637,8 +637,8 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_and_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -655,12 +655,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; 
GCN3-LABEL: atomic_and_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -678,18 +678,18 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -697,18 +697,18 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_and_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 
0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -716,11 +716,11 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_and_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -740,18 +740,18 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: 
v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -762,18 +762,18 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -784,11 +784,11 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -812,12 +812,12 @@ entry: define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; GCN1-LABEL: 
atomic_and_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -825,12 +825,12 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -838,11 +838,11 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -856,8 +856,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) 
{ ; GCN1-LABEL: atomic_and_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -872,8 +872,8 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -888,12 +888,12 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -910,16 +910,16 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: 
s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -927,16 +927,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_and_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -944,11 +944,11 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_and_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -967,16 +967,16 @@ entry: define amdgpu_kernel void 
@atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -987,16 +987,16 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_and_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1007,11 +1007,11 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_and_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1034,14 +1034,14 @@ entry: define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1049,14 +1049,14 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: 
buffer_wbinvl1_vol @@ -1064,11 +1064,11 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1083,8 +1083,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -1101,8 +1101,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_sub_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -1119,12 +1119,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_sub_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -1142,18 +1142,18 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1161,18 +1161,18 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_sub_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 
v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1180,11 +1180,11 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_sub_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1204,18 +1204,18 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1226,18 +1226,18 @@ define amdgpu_kernel void 
@atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1248,11 +1248,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1276,12 +1276,12 @@ entry: define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_sub_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1289,12 +1289,12 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1302,11 +1302,11 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1320,8 +1320,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; 
GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1336,8 +1336,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1352,12 +1352,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -1374,16 +1374,16 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: 
flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1391,16 +1391,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_sub_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1408,11 +1408,11 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_sub_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1431,16 +1431,16 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: 
s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1451,16 +1451,16 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1471,11 +1471,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_sub_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; 
GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1498,39 +1498,39 @@ entry: define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_max_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; 
GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1544,8 +1544,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_max_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -1562,8 +1562,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -1580,12 +1580,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -1603,47 +1603,47 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1662,18 +1662,18 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -1684,18 +1684,18 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: 
v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -1706,11 +1706,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1734,35 +1734,35 @@ entry: define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_max_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; 
GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1775,8 +1775,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_max_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1791,8 +1791,8 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_max_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1807,12 +1807,12 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_max_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: 
v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -1829,43 +1829,43 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1883,16 +1883,16 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -1903,16 +1903,16 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: 
s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -1923,11 +1923,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1950,39 +1950,39 @@ entry: define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt 
lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1996,8 +1996,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2014,8 +2014,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2032,12 +2032,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: 
atomic_umax_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2055,47 +2055,47 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; 
GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2114,18 +2114,18 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2136,18 +2136,18 @@ define 
amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2158,11 +2158,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2186,35 +2186,35 @@ entry: define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umax_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: 
s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2227,8 +2227,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2243,8 +2243,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) 
{ ; ; GCN2-LABEL: atomic_umax_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2259,12 +2259,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_umax_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2281,43 +2281,43 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2335,16 +2335,16 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: 
v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2355,16 +2355,16 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2375,11 +2375,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2402,39 +2402,39 @@ entry: define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: 
s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2448,8 +2448,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_min_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, 
s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2466,8 +2466,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2484,12 +2484,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2507,47 +2507,47 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; 
GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2566,18 +2566,18 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; 
GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2588,18 +2588,18 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2610,11 +2610,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2638,35 +2638,35 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; 
GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2679,8 +2679,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_min_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2695,8 +2695,8 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_min_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2711,12 +2711,12 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_min_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2733,43 +2733,43 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ 
-2787,16 +2787,16 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2807,16 +2807,16 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2827,11 +2827,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry 
-; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2854,39 +2854,39 @@ entry: define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_offset: ; GCN3: ; %bb.0: ; %entry 
-; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2900,8 +2900,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2918,8 +2918,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umin_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2936,12 +2936,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umin_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 
offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2959,47 +2959,47 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3018,18 +3018,18 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -3040,18 +3040,18 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt 
lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -3062,11 +3062,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3090,35 +3090,35 @@ entry: define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umin_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, 
s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -3131,8 +3131,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3147,8 +3147,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_umin_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3163,12 +3163,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; 
GCN3-LABEL: atomic_umin_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -3185,43 +3185,43 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; 
GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3239,16 +3239,16 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -3259,16 +3259,16 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: 
s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -3279,11 +3279,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umin_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3306,14 +3306,14 @@ entry: define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_or_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_or v[0:1], 
v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3321,14 +3321,14 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3336,11 +3336,11 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3355,8 +3355,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_or_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -3373,8 +3373,8 @@ define 
amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; ; GCN2-LABEL: atomic_or_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -3391,12 +3391,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; ; GCN3-LABEL: atomic_or_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3414,18 +3414,18 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_or v[0:1], 
v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3433,18 +3433,18 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; ; GCN2-LABEL: atomic_or_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3452,11 +3452,11 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; ; GCN3-LABEL: atomic_or_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3476,18 +3476,18 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; 
GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3498,18 +3498,18 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3520,11 +3520,11 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3548,12 +3548,12 @@ entry: define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_or_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3561,12 +3561,12 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3574,11 +3574,11 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; 
GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3592,8 +3592,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_or_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3608,8 +3608,8 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3624,12 +3624,12 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3646,16 +3646,16 @@ 
entry: define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3663,16 +3663,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; ; GCN2-LABEL: atomic_or_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3680,11 +3680,11 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; ; GCN3-LABEL: atomic_or_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; 
GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3703,16 +3703,16 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3723,16 +3723,16 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; ; GCN2-LABEL: atomic_or_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; 
GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3743,11 +3743,11 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; ; GCN3-LABEL: atomic_or_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3770,14 +3770,14 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3785,14 +3785,14 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, 
s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3800,11 +3800,11 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3819,14 +3819,14 @@ entry: define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; GCN1-LABEL: atomic_xchg_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3834,14 +3834,14 @@ define 
amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; ; GCN2-LABEL: atomic_xchg_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3849,11 +3849,11 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; ; GCN3-LABEL: atomic_xchg_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3868,8 +3868,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -3886,8 +3886,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: 
atomic_xchg_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -3904,12 +3904,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_xchg_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3927,18 +3927,18 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol 
@@ -3946,18 +3946,18 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; ; GCN2-LABEL: atomic_xchg_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3965,11 +3965,11 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; ; GCN3-LABEL: atomic_xchg_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3989,18 +3989,18 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword 
s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4011,18 +4011,18 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4033,11 +4033,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; 
GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4061,12 +4061,12 @@ entry: define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4074,12 +4074,12 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4087,11 +4087,11 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; 
GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4105,8 +4105,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4121,8 +4121,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4137,12 +4137,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4159,16 +4159,16 @@ entry: define amdgpu_kernel void 
@atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4176,16 +4176,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_xchg_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4193,11 +4193,11 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_xchg_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4216,16 +4216,16 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4236,16 +4236,16 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: 
s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4256,11 +4256,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_xchg_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4285,7 +4285,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4300,7 +4300,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN2-LABEL: atomic_cmpxchg_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -4315,7 +4315,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN3-LABEL: atomic_cmpxchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 @@ -4334,8 +4334,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s2, s4, 16 ; GCN1-NEXT: s_addc_u32 s3, s5, 0 @@ -4353,8 +4353,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s2, s4, 16 ; GCN2-NEXT: s_addc_u32 s3, s5, 0 @@ -4372,13 +4372,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4397,19 +4397,19 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: 
atomic_cmpxchg_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s7, s[0:1], 0xf ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4418,19 +4418,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; ; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s7, s[0:1], 0x3c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4439,12 +4439,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; ; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s7, s[0:1], 0x3c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4465,19 +4465,19 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s9, s[0:1], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4489,19 +4489,19 @@ define amdgpu_kernel void 
@atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s9, s[0:1], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s9 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4513,12 +4513,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s9, s[0:1], 0x44 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4544,7 +4544,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32: ; GCN1: ; %bb.0: ; %entry -; 
GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 @@ -4557,7 +4557,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN2-LABEL: atomic_cmpxchg_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 @@ -4570,7 +4570,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN3-LABEL: atomic_cmpxchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 @@ -4588,8 +4588,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 @@ -4605,8 +4605,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 @@ -4622,13 +4622,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; ; GCN3-LABEL: 
atomic_cmpxchg_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4646,17 +4646,17 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s7, s[0:1], 0xf ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4665,17 +4665,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; ; GCN2-LABEL: atomic_cmpxchg_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c 
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s7, s[0:1], 0x3c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4684,12 +4684,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; ; GCN3-LABEL: atomic_cmpxchg_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s7, s[0:1], 0x3c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4709,17 +4709,17 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s9, s[0:1], 0x11 ; 
GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4731,17 +4731,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s9, s[0:1], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s9 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4753,12 +4753,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: 
s_load_dword s9, s[0:1], 0x44 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4783,14 +4783,14 @@ entry: define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4798,14 +4798,14 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4813,11 +4813,11 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4832,8 +4832,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -4850,8 +4850,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_xor_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -4868,12 +4868,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_xor_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4891,18 +4891,18 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4910,18 +4910,18 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_xor_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: 
buffer_wbinvl1_vol @@ -4929,11 +4929,11 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_xor_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4953,18 +4953,18 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4975,18 +4975,18 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4997,11 +4997,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5025,12 +5025,12 @@ entry: define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xor_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: 
flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5038,12 +5038,12 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5051,11 +5051,11 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5069,8 +5069,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -5085,8 +5085,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: 
atomic_xor_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -5101,12 +5101,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -5123,16 +5123,16 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5140,16 +5140,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; 
GCN2-LABEL: atomic_xor_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5157,11 +5157,11 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_xor_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5180,16 +5180,16 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5200,16 +5200,16 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5220,11 +5220,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_xor_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 
@@ -5247,7 +5247,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5263,7 +5263,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5279,7 +5279,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5300,7 +5300,7 @@ entry: define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5314,7 +5314,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5328,7 +5328,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5348,8 +5348,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5368,8 +5368,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5388,10 +5388,10 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5414,8 +5414,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5432,8 +5432,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5450,10 +5450,10 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN3-LABEL: atomic_load_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5475,37 +5475,37 @@ entry: define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: 
s_endpgm ; ; GCN2-LABEL: atomic_store_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -5518,33 +5518,33 @@ entry: define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -5556,8 +5556,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5572,8 +5572,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_store_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5588,15 +5588,15 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; ; GCN3-LABEL: 
atomic_store_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -5609,8 +5609,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5623,8 +5623,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; ; GCN2-LABEL: atomic_store_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5637,15 +5637,15 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; ; GCN3-LABEL: atomic_store_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: 
s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5657,7 +5657,7 @@ entry: define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5673,7 +5673,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5689,7 +5689,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5710,7 +5710,7 @@ entry: define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5724,7 +5724,7 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: 
v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5738,7 +5738,7 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5758,8 +5758,8 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5778,8 +5778,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_f32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5798,10 +5798,10 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_f32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5824,8 
+5824,8 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5842,8 +5842,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_f32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5860,10 +5860,10 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN3-LABEL: atomic_load_f32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5885,37 +5885,37 @@ entry: define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 
0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -5928,33 +5928,33 @@ entry: define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; 
GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -5966,8 +5966,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5982,8 +5982,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; ; GCN2-LABEL: atomic_store_f32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: 
s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5998,15 +5998,15 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; ; GCN3-LABEL: atomic_store_f32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6019,8 +6019,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -6033,8 +6033,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; ; GCN2-LABEL: atomic_store_f32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -6047,15 +6047,15 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; ; GCN3-LABEL: atomic_store_f32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6067,7 +6067,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6083,7 +6083,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i8_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6099,7 +6099,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6120,7 +6120,7 @@ entry: define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6134,7 +6134,7 @@ define amdgpu_kernel 
void @atomic_load_i8(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i8: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6148,7 +6148,7 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6168,8 +6168,8 @@ entry: define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i8_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 @@ -6187,8 +6187,8 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; ; GCN2-LABEL: atomic_load_i8_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 @@ -6206,11 +6206,11 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; ; GCN3-LABEL: atomic_load_i8_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; 
GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s4, s2 +; GCN3-NEXT: s_addc_u32 s1, s5, s3 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc @@ -6231,37 +6231,37 @@ entry: define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i8_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_byte v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_byte v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 
v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6274,33 +6274,33 @@ entry: define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i8: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_byte v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_byte v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6312,8 +6312,8 @@ entry: define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, 
i64 %index) { ; GCN1-LABEL: atomic_store_i8_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, s6 ; GCN1-NEXT: s_addc_u32 s1, s5, s7 @@ -6327,8 +6327,8 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; ; GCN2-LABEL: atomic_store_i8_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, s6 ; GCN2-NEXT: s_addc_u32 s1, s5, s7 @@ -6342,14 +6342,14 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; ; GCN3-LABEL: atomic_store_i8_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s0, s4, s6 ; GCN3-NEXT: s_addc_u32 s1, s5, s7 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6362,7 +6362,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6378,7 +6378,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: 
atomic_load_i16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6394,7 +6394,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6415,7 +6415,7 @@ entry: define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6429,7 +6429,7 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6443,7 +6443,7 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6463,8 +6463,8 @@ entry: define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; 
GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -6483,8 +6483,8 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i16_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -6503,10 +6503,10 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_i16_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -6529,37 +6529,37 @@ entry: define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: 
atomic_store_i16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6572,33 +6572,33 @@ entry: define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 
+; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6610,8 +6610,8 @@ entry: define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -6626,8 +6626,8 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_store_i16_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -6642,15 +6642,15 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_store_i16_addr64_offset: ; GCN3: ; 
%bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6663,37 +6663,37 @@ entry: define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f16_offset: ; GCN3: ; %bb.0: ; %entry -; 
GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6706,33 +6706,33 @@ entry: define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: 
v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6744,33 +6744,33 @@ entry: define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { ; GCN1-LABEL: atomic_store_bf16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_bf16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6782,33 +6782,33 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat 
%in, ptr %out) { define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { ; GCN1-LABEL: atomic_store_bf16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_bf16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6819,14 +6819,14 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 
0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6834,14 +6834,14 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6849,11 +6849,11 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6868,14 +6868,14 @@ entry: define amdgpu_kernel 
void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6883,14 +6883,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6898,11 +6898,11 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: 
v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6917,14 +6917,14 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6932,14 +6932,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6947,11 +6947,11 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -6968,8 +6968,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -6986,8 +6986,8 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_inc_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -7004,12 +7004,12 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_inc_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7027,18 +7027,18 @@ entry: define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7046,18 +7046,18 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_inc_i32_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: 
buffer_wbinvl1_vol @@ -7065,11 +7065,11 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_inc_i32_incr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7089,18 +7089,18 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7111,18 +7111,18 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7133,11 +7133,11 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7161,12 +7161,12 @@ entry: define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: 
flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7174,12 +7174,12 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7187,11 +7187,11 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7205,8 +7205,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -7221,8 +7221,8 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: 
atomic_inc_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -7237,12 +7237,12 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7259,16 +7259,16 @@ entry: define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7276,16 +7276,16 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; ; 
GCN2-LABEL: atomic_inc_i32_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7293,11 +7293,11 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_inc_i32_incr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7316,16 +7316,16 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_ret_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7336,16 +7336,16 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7356,11 +7356,11 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_inc_i32_ret_incr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 
@@ -7383,14 +7383,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7398,14 +7398,14 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7413,11 +7413,11 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 
+; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7432,14 +7432,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7447,14 +7447,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7462,11 +7462,11 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; 
GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7481,14 +7481,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7496,14 +7496,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
; GCN2-NEXT: buffer_wbinvl1_vol @@ -7511,11 +7511,11 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -7532,8 +7532,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -7550,8 +7550,8 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_dec_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -7568,12 +7568,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_dec_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; 
GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7591,18 +7591,18 @@ entry: define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7610,18 +7610,18 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_dec_i32_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 
s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7629,11 +7629,11 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_dec_i32_decr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7653,18 +7653,18 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7675,18 +7675,18 @@ define 
amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7697,11 +7697,11 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7725,12 +7725,12 @@ entry: define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: 
s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7738,12 +7738,12 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7751,11 +7751,11 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7769,8 +7769,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword 
s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -7785,8 +7785,8 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -7801,12 +7801,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7823,16 +7823,16 @@ entry: define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, 
s6 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7840,16 +7840,16 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_dec_i32_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7857,11 +7857,11 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_dec_i32_decr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7880,16 +7880,16 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_ret_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; 
GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7900,16 +7900,16 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7920,11 +7920,11 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_dec_i32_ret_decr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 
0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7947,7 +7947,7 @@ entry: define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -7963,7 +7963,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -7979,7 +7979,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -7999,7 +7999,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -8013,7 +8013,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; 
GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -8027,7 +8027,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -8046,7 +8046,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -8062,7 +8062,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_bf16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -8078,7 +8078,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -8098,7 +8098,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -8112,7 +8112,7 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; ; 
GCN2-LABEL: atomic_load_bf16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -8126,7 +8126,7 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 4d80e9124f41f9..5bd527149572e5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -3823,7 +3823,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -3853,7 +3853,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -3883,7 +3883,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 @@ -3918,8 +3918,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -3953,8 +3953,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -3988,32 +3988,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: 
flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4029,7 +4029,7 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -4057,7 +4057,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -4085,7 +4085,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 @@ -4119,8 +4119,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -4152,8 +4152,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -4185,32 +4185,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; 
GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4966,7 +4966,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -4996,7 +4996,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -5026,7 +5026,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 @@ -5061,8 +5061,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -5096,8 +5096,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -5131,32 +5131,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: 
v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -5172,8 +5172,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -5205,8 +5205,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: 
s_mov_b32 s6, s5 @@ -5238,32 +5238,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -6760,7 +6760,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg 
%out, i32 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -6790,7 +6790,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -6820,7 +6820,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 @@ -6855,8 +6855,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -6890,8 +6890,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -6925,32 +6925,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword 
v[0:1], v2 @@ -6966,8 +6966,8 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 @@ -6990,8 +6990,8 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 @@ -7014,17 +7014,17 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s4, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7043,8 +7043,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; 
GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -7076,8 +7076,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -7109,32 +7109,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s0, v3 +; 
GCN3-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 5420733b7dc557..b8c8d993d389bd 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_add_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -21,7 +21,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_add_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -36,7 +36,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 @@ -53,8 +53,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_add_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -72,8 +72,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_add_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -92,8 +92,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_add_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -113,8 +113,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ 
-132,8 +132,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_add_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -152,8 +152,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_add_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -174,7 +174,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -195,7 +195,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_add_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -216,7 +216,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 
s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -240,7 +240,7 @@ entry: define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_add_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -253,7 +253,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_add_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -266,7 +266,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -282,8 +282,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_add_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -299,8 +299,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_add_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -317,8 +317,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_add_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -337,8 +337,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -354,8 +354,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_add_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -372,8 +372,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_add_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: 
s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -393,7 +393,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -412,7 +412,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_add_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -431,7 +431,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_add_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -454,7 +454,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_and_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -469,7 +469,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_and_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -484,7 +484,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -501,8 +501,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_and_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -520,8 +520,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_and_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -540,8 +540,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_and_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ 
-561,8 +561,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -580,8 +580,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_and_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -600,8 +600,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_and_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -622,7 +622,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -643,7 +643,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr 
%out2, ; ; GCN2-LABEL: atomic_and_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -664,7 +664,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -688,7 +688,7 @@ entry: define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_and_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -701,7 +701,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_and_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -714,7 +714,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -730,8 +730,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_and_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; 
GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -747,8 +747,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_and_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -765,8 +765,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_and_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -785,8 +785,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -802,8 +802,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_and_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -820,8 +820,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_and_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -841,7 +841,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -860,7 +860,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_and_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -879,7 +879,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -902,7 +902,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: 
atomic_sub_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -917,7 +917,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_sub_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -932,7 +932,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -949,8 +949,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -968,8 +968,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_sub_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -988,8 +988,8 @@ define 
amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_sub_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -1009,8 +1009,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1028,8 +1028,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_sub_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1048,8 +1048,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_sub_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 @@ -1070,7 +1070,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1091,7 +1091,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_sub_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1112,7 +1112,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1136,7 +1136,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_sub_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_sub_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -1162,7 +1162,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { 
; ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1178,8 +1178,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1195,8 +1195,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_sub_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1213,8 +1213,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_sub_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -1233,8 +1233,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; 
GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1250,8 +1250,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_sub_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1268,8 +1268,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_sub_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1289,7 +1289,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1308,7 +1308,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_sub_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: 
v_mov_b32_e32 v1, s5 @@ -1327,7 +1327,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1350,7 +1350,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_max_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -1364,7 +1364,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_max_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -1378,7 +1378,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1395,8 +1395,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_max_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 
; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -1414,8 +1414,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_max_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -1434,8 +1434,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_max_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -1455,8 +1455,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1473,8 +1473,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_max_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 3 @@ -1492,8 +1492,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_max_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1514,7 +1514,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1535,7 +1535,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1556,7 +1556,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1580,7 +1580,7 @@ entry: define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_max_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_max_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -1604,7 +1604,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1620,8 +1620,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_max_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1637,8 +1637,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_max_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1655,8 +1655,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_max_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], 
s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -1675,8 +1675,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1691,8 +1691,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_max_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1708,8 +1708,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_max_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1729,7 +1729,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: 
s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1748,7 +1748,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_max_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1767,7 +1767,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_max_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1790,7 +1790,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -1804,7 +1804,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_umax_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -1818,7 +1818,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1835,8 +1835,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -1854,8 +1854,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umax_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -1874,8 +1874,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umax_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -1895,8 +1895,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; 
GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1913,8 +1913,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN2-LABEL: atomic_umax_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1932,8 +1932,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umax_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1954,7 +1954,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1975,7 +1975,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1996,7 +1996,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr 
%out, ptr %out2 ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2020,7 +2020,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umax_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -2032,7 +2032,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_umax_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -2044,7 +2044,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2060,8 +2060,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2077,8 +2077,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; 
GCN2-LABEL: atomic_umax_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2095,8 +2095,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_umax_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -2115,8 +2115,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2131,8 +2131,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_umax_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2148,8 +2148,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umax_i64_addr64: ; 
GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2169,7 +2169,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2188,7 +2188,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2207,7 +2207,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2230,7 +2230,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -2244,7 +2244,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 
%in) { ; ; GCN2-LABEL: atomic_min_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -2258,7 +2258,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2275,8 +2275,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_min_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -2294,8 +2294,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_min_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -2314,8 +2314,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_min_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -2335,8 +2335,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2353,8 +2353,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_min_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2372,8 +2372,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_min_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2394,7 +2394,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; 
GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2415,7 +2415,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2436,7 +2436,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2460,7 +2460,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -2472,7 +2472,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_min_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -2484,7 +2484,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2500,8 +2500,8 
@@ entry: define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_min_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2517,8 +2517,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_min_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2535,8 +2535,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_min_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -2555,8 +2555,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2571,8 +2571,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr 
%out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_min_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2588,8 +2588,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_min_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2609,7 +2609,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2628,7 +2628,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_min_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2647,7 +2647,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_min_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2670,7 +2670,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -2684,7 +2684,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_umin_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -2698,7 +2698,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2715,8 +2715,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -2734,8 +2734,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umin_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -2754,8 +2754,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umin_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -2775,8 +2775,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2793,8 +2793,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN2-LABEL: atomic_umin_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2812,8 +2812,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umin_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 
s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2834,7 +2834,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2855,7 +2855,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umin_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2876,7 +2876,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2900,7 +2900,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umin_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -2912,7 +2912,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_umin_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -2924,7 +2924,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2940,8 +2940,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2957,8 +2957,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_umin_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2975,8 +2975,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_umin_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: 
v_dual_mov_b32 v3, s1 @@ -2995,8 +2995,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3011,8 +3011,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_umin_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3028,8 +3028,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umin_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3049,7 +3049,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3068,7 +3068,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 
% ; ; GCN2-LABEL: atomic_umin_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3087,7 +3087,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3110,7 +3110,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_or_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3125,7 +3125,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_or_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3140,7 +3140,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3157,8 +3157,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_or_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -3176,8 +3176,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; ; GCN2-LABEL: atomic_or_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -3196,8 +3196,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-LABEL: atomic_or_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -3217,8 +3217,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3236,8 +3236,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; ; GCN2-LABEL: atomic_or_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3256,8 +3256,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-LABEL: atomic_or_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3278,7 +3278,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3299,7 +3299,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_or_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3320,7 +3320,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, 
s5 @@ -3344,7 +3344,7 @@ entry: define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_or_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -3357,7 +3357,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_or_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -3370,7 +3370,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3386,8 +3386,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_or_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3403,8 +3403,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_or_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: 
v_mov_b32_e32 v1, s5 @@ -3421,8 +3421,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_or_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -3441,8 +3441,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3458,8 +3458,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; ; GCN2-LABEL: atomic_or_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3476,8 +3476,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-LABEL: atomic_or_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 
:: v_dual_mov_b32 v1, s3 @@ -3497,7 +3497,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3516,7 +3516,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; ; GCN2-LABEL: atomic_or_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3535,7 +3535,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; ; GFX12-LABEL: atomic_or_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3558,7 +3558,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3573,7 +3573,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_xchg_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3588,7 +3588,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, 
i64 %in) { ; ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3605,7 +3605,7 @@ entry: define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GCN1-LABEL: atomic_xchg_f64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3620,7 +3620,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; ; GCN2-LABEL: atomic_xchg_f64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3635,7 +3635,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3652,7 +3652,7 @@ entry: define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GCN1-LABEL: atomic_xchg_pointer_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3667,7 +3667,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; ; GCN2-LABEL: atomic_xchg_pointer_offset: ; GCN2: ; 
%bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3682,7 +3682,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3699,8 +3699,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -3718,8 +3718,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_xchg_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -3738,8 +3738,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_xchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -3759,8 +3759,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3778,8 +3778,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN2-LABEL: atomic_xchg_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3798,8 +3798,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_xchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3820,7 +3820,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3841,7 +3841,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3862,7 +3862,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3886,7 +3886,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -3899,7 +3899,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_xchg_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -3912,7 +3912,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3928,8 +3928,8 @@ entry: define 
amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3945,8 +3945,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_xchg_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3963,8 +3963,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_xchg_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -3983,8 +3983,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4000,8 +4000,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 
%in, i64 %index) ; ; GCN2-LABEL: atomic_xchg_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4018,8 +4018,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4039,7 +4039,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4058,7 +4058,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_xchg_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4077,7 +4077,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -4100,7 +4100,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4115,7 +4115,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_xor_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -4130,7 +4130,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4147,8 +4147,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -4166,8 +4166,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_xor_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -4186,8 +4186,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_xor_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -4207,8 +4207,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4226,8 +4226,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_xor_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4246,8 +4246,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_xor_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4268,7 +4268,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4289,7 +4289,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_xor_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4310,7 +4310,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -4334,7 +4334,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xor_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -4347,7 +4347,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_xor_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -4360,7 +4360,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4376,8 +4376,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4393,8 +4393,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_xor_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4411,8 +4411,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_xor_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -4431,8 +4431,8 @@ 
entry: define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4448,8 +4448,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_xor_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4466,8 +4466,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xor_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4487,7 +4487,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4506,7 +4506,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_xor_i64_ret_addr64: ; 
GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4525,7 +4525,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -4548,7 +4548,7 @@ entry: define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4564,7 +4564,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -4580,7 +4580,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4599,7 +4599,7 @@ entry: define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -4613,7 +4613,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -4627,7 +4627,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4645,8 +4645,8 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -4665,8 +4665,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -4686,8 +4686,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-LABEL: atomic_load_i64_addr64_offset: ; 
GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4710,8 +4710,8 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -4728,8 +4728,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -4747,8 +4747,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-LABEL: atomic_load_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4770,7 +4770,7 @@ entry: define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i64_offset: 
; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_u32 s0, s2, 32 @@ -4783,7 +4783,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: s_add_u32 s0, s2, 32 @@ -4796,7 +4796,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4811,7 +4811,7 @@ entry: define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -4822,7 +4822,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -4833,7 +4833,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4847,8 +4847,8 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4864,8 +4864,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_store_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4882,8 +4882,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-LABEL: atomic_store_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -4902,8 +4902,8 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; 
GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4917,8 +4917,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; ; GCN2-LABEL: atomic_store_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4933,8 +4933,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-LABEL: atomic_store_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -4952,8 +4952,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s2, s4, 32 ; GCN1-NEXT: s_addc_u32 s3, s5, 0 @@ -4970,8 +4970,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; ; GCN2-LABEL: atomic_cmpxchg_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN2-NEXT: s_add_u32 s2, s4, 32 ; GCN2-NEXT: s_addc_u32 s3, s5, 0 @@ -4989,8 +4989,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-LABEL: atomic_cmpxchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5008,8 +5008,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_soffset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s2, s4, 0x11940 ; GCN1-NEXT: s_addc_u32 s3, s5, 0 @@ -5026,8 +5026,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; ; GCN2-LABEL: atomic_cmpxchg_i64_soffset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s2, s4, 0x11940 ; GCN2-NEXT: s_addc_u32 s3, s5, 0 @@ -5045,8 +5045,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-LABEL: atomic_cmpxchg_i64_soffset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5064,7 +5064,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5084,7 +5084,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5104,7 +5104,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -5126,7 +5126,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5146,7 +5146,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; ; GCN2-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5166,7 +5166,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5187,8 +5187,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -5211,8 +5211,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -5236,8 +5236,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 @@ -5262,8 +5262,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, s4 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 @@ -5278,8 +5278,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; ; GCN2-LABEL: atomic_cmpxchg_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, s4 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 @@ -5295,8 +5295,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GFX12-LABEL: atomic_cmpxchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5313,7 +5313,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: v_mov_b32_e32 v5, s1 @@ -5331,7 +5331,7 @@ define 
amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: v_mov_b32_e32 v5, s1 @@ -5349,7 +5349,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -5370,7 +5370,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5388,7 +5388,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; ; GCN2-LABEL: atomic_cmpxchg_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5406,7 +5406,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5426,8 +5426,8 @@ entry: define amdgpu_kernel 
void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN1-NEXT: s_add_u32 s2, s4, s2 @@ -5448,8 +5448,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN2-NEXT: s_add_u32 s2, s4, s2 @@ -5471,8 +5471,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 @@ -5496,7 +5496,7 @@ entry: define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5512,7 +5512,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f64_offset: ; GCN2: 
; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5528,7 +5528,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5547,7 +5547,7 @@ entry: define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5561,7 +5561,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5575,7 +5575,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5593,8 +5593,8 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5613,8 +5613,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_f64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5634,8 +5634,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-LABEL: atomic_load_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5658,8 +5658,8 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5676,8 +5676,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_f64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5695,8 +5695,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-LABEL: atomic_load_f64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5718,7 +5718,7 @@ entry: define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GCN1-LABEL: atomic_store_f64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_u32 s0, s2, 32 @@ -5731,7 +5731,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_f64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: s_add_u32 s0, s2, 32 @@ -5744,7 +5744,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5759,7 +5759,7 @@ entry: define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GCN1-LABEL: atomic_store_f64: ; 
GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5770,7 +5770,7 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_f64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5781,7 +5781,7 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5795,8 +5795,8 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5812,8 +5812,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; ; GCN2-LABEL: atomic_store_f64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5830,8 +5830,8 @@ define 
amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-LABEL: atomic_store_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -5850,8 +5850,8 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5865,8 +5865,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; ; GCN2-LABEL: atomic_store_f64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5881,8 +5881,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-LABEL: atomic_store_f64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -5900,7 
+5900,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5915,7 +5915,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_inc_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5930,7 +5930,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5947,8 +5947,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -5966,8 +5966,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_inc_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -5986,8 +5986,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_inc_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -6007,8 +6007,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6026,8 +6026,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_inc_i64_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6046,8 +6046,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_inc_i64_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6068,7 +6068,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_ret_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6089,7 +6089,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_inc_i64_ret_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6110,7 +6110,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -6134,7 +6134,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_inc_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6147,7 +6147,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_inc_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; 
GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6160,7 +6160,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_inc_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6176,8 +6176,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6193,8 +6193,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_inc_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6211,8 +6211,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_inc_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -6231,8 +6231,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) { ; 
GCN1-LABEL: atomic_inc_i64_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6248,8 +6248,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_inc_i64_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6266,8 +6266,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_inc_i64_incr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6287,7 +6287,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_ret_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6306,7 +6306,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_inc_i64_ret_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6325,7 +6325,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_inc_i64_ret_incr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -6348,7 +6348,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6363,7 +6363,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_dec_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6378,7 +6378,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6395,8 +6395,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd 
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -6414,8 +6414,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_dec_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -6434,8 +6434,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_dec_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -6455,8 +6455,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6474,8 +6474,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_dec_i64_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6494,8 +6494,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_dec_i64_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6516,7 +6516,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_ret_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6537,7 +6537,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_dec_i64_ret_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6558,7 +6558,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -6582,7 +6582,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_dec_i64: ; 
GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6595,7 +6595,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_dec_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6608,7 +6608,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_dec_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6624,8 +6624,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6641,8 +6641,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_dec_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6659,8 +6659,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; 
GFX12-LABEL: atomic_dec_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -6679,8 +6679,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6696,8 +6696,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_dec_i64_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6714,8 +6714,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_dec_i64_decr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6735,7 +6735,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, 
ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_ret_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6754,7 +6754,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_dec_i64_ret_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6773,7 +6773,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 7e4a36b7dc11b4..d812b4b7d86e6c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -4258,8 +4258,8 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -4292,8 +4292,8 @@ define amdgpu_kernel void 
@atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_max_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -4326,10 +4326,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN3-LABEL: atomic_max_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v5, s1 @@ -4365,7 +4365,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -4402,7 +4402,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -4439,7 +4439,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_max_i64_ret_addr64_offset: ; 
GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -4482,8 +4482,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -4514,8 +4514,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_max_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -4546,10 +4546,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN3-LABEL: atomic_max_i64_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v5, s1 @@ -4584,7 +4584,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: 
s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -4619,7 +4619,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_max_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -4654,7 +4654,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN3-LABEL: atomic_max_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -5640,8 +5640,8 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5674,8 +5674,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN2-LABEL: atomic_umax_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: 
s_add_u32 s0, s0, s4 @@ -5708,10 +5708,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN3-LABEL: atomic_umax_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v5, s1 @@ -5747,7 +5747,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -5784,7 +5784,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -5821,7 +5821,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -5864,7 +5864,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64: ; GCN1: ; 
%bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -5899,7 +5899,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -5934,7 +5934,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN3-LABEL: atomic_umax_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -7864,8 +7864,8 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -7898,8 +7898,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_min_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], 
s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -7932,10 +7932,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN3-LABEL: atomic_min_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v5, s1 @@ -7971,7 +7971,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -8008,7 +8008,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -8045,7 +8045,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -8088,7 +8088,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 @@ -8118,7 +8118,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_min_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -8148,7 +8148,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GCN3-LABEL: atomic_min_i64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -8183,7 +8183,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -8218,7 +8218,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_min_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -8253,7 +8253,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN3-LABEL: atomic_min_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], 
s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index 4846e21fe836eb..bac2d8b8b40c26 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -21,7 +21,7 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0 define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -41,9 +41,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -77,7 +75,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_0_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -103,9 +101,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -150,7 +146,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -170,9 +166,7 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -206,7 +200,7 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -226,9 +220,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -262,7 +254,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_f64_0_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -288,9 +280,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -335,7 +325,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -355,9 +345,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -391,7 +379,7 @@ define amdgpu_kernel 
void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_1_f64_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -417,9 +405,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -464,7 +450,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -484,9 +470,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -522,7 +506,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void 
@combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -548,9 +532,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -597,7 +579,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -623,9 +605,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -672,7 +652,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr 
addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s6, 0 ; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -698,7 +678,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s6, 0 ; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -723,9 +703,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -749,9 +727,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -798,7 +774,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr 
addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s6, 0 ; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -824,7 +800,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s6, 0 ; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -849,9 +825,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -875,9 +849,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -927,56 +899,56 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr 
addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_add_x_one_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 
s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_add_x_one_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -994,8 +966,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1020,56 +992,56 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: 
test_f32_mul_y_add_x_one: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_add_x_one: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, 
s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_one: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1087,8 +1059,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1113,55 +1085,55 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y: ; SI-NOFMA: ; %bb.0: -; 
SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_add_x_negone_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: 
s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_add_x_negone_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1180,8 +1152,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1206,55 +1178,55 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 
0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_add_x_negone: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: 
s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_negone: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1273,8 +1245,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1299,55 +1271,55 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_one_x_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; 
SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_one_x_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: 
s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_one_x_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1366,8 +1338,8 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1392,55 +1364,55 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; 
SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_one_x: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 
s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_one_x: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1459,8 +1431,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1485,55 +1457,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; 
SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_negone_x_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; 
SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1552,8 +1524,8 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1578,55 +1550,55 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 
+; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: ; GFX11-NOFMA: ; 
%bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1645,8 +1617,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1671,55 +1643,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; 
SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_x_one_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_one_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; 
GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1738,8 +1710,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1764,55 +1736,55 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 
+; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_x_one: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_one: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1831,8 +1803,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1857,55 +1829,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; 
SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1924,8 +1896,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) 
%out, ; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1950,55 +1922,55 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -2017,8 +1989,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 
s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -2047,7 +2019,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_interp: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s10, -1 ; SI-NOFMA-NEXT: s_mov_b32 s14, s10 @@ -2079,7 +2051,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; SI-FMA-LABEL: test_f32_interp: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s10, -1 ; SI-FMA-NEXT: s_mov_b32 s18, s10 @@ -2109,7 +2081,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; GFX11-NOFMA-LABEL: test_f32_interp: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x2 @@ -2130,7 +2102,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; GFX11-FMA-LABEL: test_f32_interp: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x2 @@ -2163,7 +2135,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; SI-FMA-LABEL: test_f64_interp: ; SI-FMA: 
; %bb.0: -; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s10, -1 ; SI-FMA-NEXT: s_mov_b32 s18, s10 @@ -2193,7 +2165,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; ; GFX11-NOFMA-LABEL: test_f64_interp: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x2 @@ -2214,7 +2186,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; ; GFX11-FMA-LABEL: test_f64_interp: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x2 @@ -2248,7 +2220,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: fma_neg_2.0_neg_a_b_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2264,9 +2236,7 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -2296,7 +2266,7 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void 
@fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: fma_2.0_neg_a_b_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2312,9 +2282,7 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: fma_2.0_neg_a_b_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -2344,7 +2312,7 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 { ; SI-LABEL: fma_neg_b_c_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v12, 4, v0 @@ -2365,9 +2333,7 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: fma_neg_b_c_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll index 39a9a85081af59..93ed64d93b8ba4 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.ll @@ -159,15 +159,15 @@ define float @fold_fmul_distributive(float %x, float %y) { define amdgpu_kernel void 
@vec_mul_scalar_add_fma(<2 x float> %a, <2 x float> %b, float %c1, ptr addrspace(1) %inptr) { ; GFX906-LABEL: vec_mul_scalar_add_fma: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX906-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, s8 ; GFX906-NEXT: v_mov_b32_e32 v2, s6 ; GFX906-NEXT: v_fmac_f32_e32 v1, s4, v2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] offset:4 ; GFX906-NEXT: s_endpgm %gep = getelementptr float, ptr addrspace(1) %inptr, i32 1 %c = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 84852c2632f671..23eb73038917d2 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -37,7 +37,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_0_f32: 
; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -97,7 +97,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmax3_olt_0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -139,7 +139,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -169,7 +169,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -199,7 +199,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -229,7 +229,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmax3_olt_1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 
0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -270,7 +270,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -304,7 +304,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -338,7 +338,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -368,7 +368,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmax3_olt_0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -410,7 +410,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; 
SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -444,7 +444,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -478,7 +478,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -508,7 +508,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmax3_olt_1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll index 018399983a863d..01b2f207388e8a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_uge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -59,7 +59,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -80,7 +80,7 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_oge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -111,7 +111,7 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -132,7 +132,7 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_ugt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -163,7 +163,7 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmax_legacy_ogt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -184,7 +184,7 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_ogt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll index 3b7009023b03af..87ac95a1cd7390 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll @@ -262,8 +262,8 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fmaximumi_f32_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS @@ -286,8 +286,8 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fmaximum_f16_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 84099e472d65fd..764fb992d4d34c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void 
@v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -28,7 +28,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -45,7 +45,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -63,7 +63,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -83,7 +83,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -95,14 +95,13 @@ define 
amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -123,7 +122,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -140,7 +139,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -157,7 +156,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 
@@ -175,7 +174,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -195,7 +194,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -207,14 +206,13 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -236,7 +234,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; 
SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -253,7 +251,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -270,7 +268,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -288,7 +286,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -308,7 +306,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -320,14 +318,13 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -349,7 +346,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -366,7 +363,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -383,7 +380,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -401,7 +398,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -421,7 +418,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -433,14 +430,13 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -462,7 +458,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -480,7 +476,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; SI-GISEL: ; %bb.0: -; 
SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -498,7 +494,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -517,7 +513,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -538,7 +534,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -551,14 +547,13 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_maxmin_f32 v1, v1, 4.0, 2.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -580,7 +575,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -601,7 +596,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -622,7 +617,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -644,7 +639,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -668,7 +663,7 @@ define amdgpu_kernel void 
@v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX9-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -684,7 +679,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -700,16 +695,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX11-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v1 ; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -721,14 +714,13 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX11-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX11-GISEL: ; %bb.0: -; 
GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_med3_f32 v2, v1, 2.0, 4.0 ; GFX11-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1 ; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc @@ -755,7 +747,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -773,7 +765,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_fmed3_r_i_i_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -791,7 +783,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_f64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -810,7 +802,7 @@ define amdgpu_kernel void 
@v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_fmed3_r_i_i_f64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -831,7 +823,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_fmed3_r_i_i_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -844,16 +836,14 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_test_fmed3_r_i_i_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -875,7 +865,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -891,7 +881,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -907,7 +897,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -924,7 +914,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -943,7 +933,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -954,9 +944,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -981,7 +969,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -998,7 +986,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1016,7 +1004,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1034,7 +1022,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1057,7 +1045,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX9-SDAG-LABEL: 
v_test_legacy_fmed3_r_i_i_f32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -1069,7 +1057,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX9-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -1084,14 +1072,13 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX11-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -1100,17 +1087,15 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX11-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; 
GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 2.0, v1 ; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 2.0, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 4.0, v1 ; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1138,7 +1123,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1162,7 +1147,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1185,7 +1170,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1212,7 +1197,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1244,7 +1229,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1259,7 +1244,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1275,9 +1260,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1294,9 +1277,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; 
GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1332,7 +1313,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1356,7 +1337,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1379,7 +1360,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1406,7 +1387,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: 
; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1438,7 +1419,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1453,7 +1434,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1469,9 +1450,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1488,9 +1467,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1526,7 +1503,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1550,7 +1527,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1573,7 +1550,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1600,7 +1577,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1632,7 +1609,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1647,7 +1624,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1663,9 +1640,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1682,9 +1657,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: 
global_load_b32 v1, v0, s[2:3] glc dlc @@ -1720,7 +1693,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1744,7 +1717,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1768,7 +1741,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1795,7 +1768,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1828,7 +1801,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX9-SDAG-LABEL: 
v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1843,7 +1816,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1860,9 +1833,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1879,9 +1850,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1924,7 +1893,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs define amdgpu_kernel void 
@v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1948,7 +1917,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1973,7 +1942,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2000,7 +1969,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2034,7 +2003,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; 
GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2049,7 +2018,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2067,9 +2036,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2086,9 +2053,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2134,7 +2099,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0: ; SI-SDAG: ; 
%bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2161,7 +2126,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; SI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2186,7 +2151,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2216,7 +2181,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2250,7 +2215,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2268,9 +2233,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: 
s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2311,7 +2274,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2335,7 +2298,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; SI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2357,7 +2320,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; VI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2384,7 +2347,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; VI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; 
VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2415,7 +2378,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_nnan_input_calls_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2430,9 +2393,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_nnan_input_calls_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2465,7 +2426,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_call_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2489,7 +2450,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_nnan_call_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2511,7 +2472,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; 
VI-SDAG-LABEL: v_nnan_call_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2538,7 +2499,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_nnan_call_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2569,7 +2530,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_nnan_call_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2584,9 +2545,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_nnan_call_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2619,7 +2578,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_fast_call_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: 
s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2643,7 +2602,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_fast_call_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2665,7 +2624,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_fast_call_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2692,7 +2651,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_fast_call_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2723,7 +2682,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_fast_call_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2738,9 +2697,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_fast_call_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 
s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2785,7 +2742,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2809,7 +2766,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2831,7 +2788,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2858,7 +2815,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2889,7 +2846,7 @@ define amdgpu_kernel void 
@v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2904,9 +2861,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2939,7 +2894,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2963,7 +2918,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2985,7 +2940,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: 
s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3012,7 +2967,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3043,7 +2998,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3058,9 +3013,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3093,7 +3046,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; 
SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3117,7 +3070,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3140,7 +3093,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3167,7 +3120,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3199,7 +3152,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3214,7 +3167,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; 
GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3230,9 +3183,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3249,9 +3200,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3287,7 +3236,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3311,7 +3260,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: 
v_test_global_nnans_med3_f32_pat2: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3333,7 +3282,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3360,7 +3309,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3391,7 +3340,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3406,9 +3355,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3441,7 +3388,7 @@ 
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3465,7 +3412,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3487,7 +3434,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3514,7 +3461,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3545,7 +3492,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3560,9 +3507,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3595,7 +3540,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3619,7 +3564,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3641,7 +3586,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ 
-3668,7 +3613,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3699,7 +3644,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3714,9 +3659,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3749,7 +3692,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3773,7 +3716,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: 
v_test_global_nnans_med3_f32_pat5: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3795,7 +3738,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3822,7 +3765,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3853,7 +3796,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3868,9 +3811,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3903,7 +3844,7 @@ 
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3927,7 +3868,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3949,7 +3890,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3976,7 +3917,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4007,7 +3948,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4022,9 +3963,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4057,7 +3996,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4081,7 +4020,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4103,7 +4042,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ 
-4130,7 +4069,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4161,7 +4100,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4176,9 +4115,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat7: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4211,7 +4148,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4235,7 +4172,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: 
v_test_global_nnans_med3_f32_pat8: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4257,7 +4194,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4284,7 +4221,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4315,7 +4252,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4330,9 +4267,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4365,7 +4300,7 @@ 
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4389,7 +4324,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4411,7 +4346,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4438,7 +4373,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4469,7 +4404,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4484,9 +4419,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4519,7 +4452,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4543,7 +4476,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4565,7 +4498,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ 
-4592,7 +4525,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4623,7 +4556,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4638,9 +4571,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4673,7 +4604,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4697,7 +4628,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; SI-GISEL-LABEL: 
v_test_global_nnans_med3_f32_pat11: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4719,7 +4650,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4746,7 +4677,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4777,7 +4708,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4792,9 +4723,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4827,7 +4756,7 @@ 
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4851,7 +4780,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4873,7 +4802,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4900,7 +4829,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4931,7 +4860,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4946,9 +4875,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4981,7 +4908,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5005,7 +4932,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5027,7 +4954,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ 
-5054,7 +4981,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5085,7 +5012,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5100,9 +5027,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5135,7 +5060,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5159,7 +5084,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; SI-GISEL-LABEL: 
v_test_global_nnans_med3_f32_pat14: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5181,7 +5106,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5208,7 +5133,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5239,7 +5164,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5254,9 +5179,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5289,7 +5212,7 @@ 
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5313,7 +5236,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5335,7 +5258,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5362,7 +5285,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5393,7 +5316,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5408,9 +5331,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5446,7 +5367,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5470,7 +5391,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5492,7 +5413,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ 
-5519,7 +5440,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5550,7 +5471,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5565,9 +5486,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5604,7 +5523,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5637,7 +5556,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; SI-GISEL-LABEL: 
v_test_safe_med3_f32_pat0_multi_use0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5669,7 +5588,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5704,7 +5623,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5743,7 +5662,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5766,9 +5685,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5808,7 
+5725,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5841,7 +5758,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5873,7 +5790,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5908,7 +5825,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5947,7 +5864,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; 
GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5970,9 +5887,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5981,10 +5896,9 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX11-SDAG-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 +; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_max_f32_e32 v4, v1, v2 +; GFX11-SDAG-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v1, v2 ; GFX11-SDAG-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3 @@ -5997,9 +5911,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6039,7 +5951,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6072,7 +5984,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6104,7 +6016,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6139,7 +6051,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6178,7 +6090,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; 
GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6201,9 +6113,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6243,7 +6153,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6272,7 +6182,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6300,7 +6210,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6333,7 +6243,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6370,7 +6280,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6391,9 +6301,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6414,9 +6322,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6453,7 +6359,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) 
%out, ptr define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6480,7 +6386,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6505,7 +6411,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6535,7 +6441,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6569,7 +6475,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6587,9 +6493,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6630,7 +6534,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6657,7 +6561,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6682,7 +6586,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6712,7 +6616,7 @@ define amdgpu_kernel void 
@v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6746,7 +6650,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6764,9 +6668,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6807,7 +6709,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6834,7 +6736,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; 
SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6859,7 +6761,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6889,7 +6791,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6923,7 +6825,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6941,9 +6843,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6984,7 +6884,7 @@ define amdgpu_kernel void 
@v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7008,7 +6908,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; SI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7031,7 +6931,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7058,7 +6958,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7090,7 +6990,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 
0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7105,7 +7005,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7121,9 +7021,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7140,9 +7038,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7178,7 +7074,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr 
addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7204,7 +7100,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7230,7 +7126,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7260,7 +7156,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7295,7 +7191,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7313,7 +7209,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7332,9 +7228,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7353,9 +7247,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7394,7 +7286,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_min_max_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 
0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7419,7 +7311,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_global_nnans_min_max_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7442,7 +7334,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_global_nnans_min_max_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7470,7 +7362,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_global_nnans_min_max_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7502,7 +7394,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_global_nnans_min_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7518,9 +7410,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; GFX11-LABEL: v_test_global_nnans_min_max_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; 
GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7551,7 +7441,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -7570,7 +7460,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -7597,7 +7487,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7616,7 +7506,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7637,7 +7527,7 
@@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -7649,14 +7539,13 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f16 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -7677,7 +7566,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -7708,7 +7597,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; SI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; 
SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7755,7 +7644,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7788,7 +7677,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7825,7 +7714,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_nnan_inputs_med3_f16_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -7843,9 +7732,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_nnan_inputs_med3_f16_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -7887,7 +7774,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: two_non_inline_constant: ; SI-SDAG: 
; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7905,7 +7792,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: two_non_inline_constant: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -7923,7 +7810,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: two_non_inline_constant: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7942,7 +7829,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: two_non_inline_constant: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7963,7 +7850,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: two_non_inline_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -7976,9 +7863,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX11-SDAG-LABEL: two_non_inline_constant: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], 
s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -7994,15 +7879,14 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX11-GISEL-LABEL: two_non_inline_constant: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0.5, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, 0x41000000, v2 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -8024,7 +7908,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: one_non_inline_constant: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -8046,7 +7930,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: one_non_inline_constant: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -8068,7 +7952,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: one_non_inline_constant: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -8090,7 +7974,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: one_non_inline_constant: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -8114,7 +7998,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: one_non_inline_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8130,9 +8014,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: one_non_inline_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -8165,7 +8047,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; 
SI-SDAG-LABEL: two_non_inline_constant_multi_use: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -8191,7 +8073,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: two_non_inline_constant_multi_use: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -8217,7 +8099,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: two_non_inline_constant_multi_use: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -8243,7 +8125,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: two_non_inline_constant_multi_use: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41000000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x41800000 @@ -8271,7 +8153,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX9-SDAG-LABEL: two_non_inline_constant_multi_use: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -8291,7 +8173,7 @@ define amdgpu_kernel void 
@two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX9-GISEL-LABEL: two_non_inline_constant_multi_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x41800000 @@ -8311,9 +8193,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX11-SDAG-LABEL: two_non_inline_constant_multi_use: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -8335,15 +8215,13 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX11-GISEL-LABEL: two_non_inline_constant_multi_use: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_add_f32_e32 v3, 0.5, v1 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0x41800000 :: v_dual_add_f32 v3, 0.5, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_med3_f32 v2, v3, 0x41000000, v2 ; GFX11-GISEL-NEXT: v_add_f32_e32 v3, 0x41800000, v1 ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll 
b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 3a55b2d50a5e54..7337d90b4bea63 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -37,7 +37,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -97,7 +97,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -139,7 +139,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; 
SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -169,7 +169,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -199,7 +199,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -229,7 +229,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -270,7 +270,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -304,7 +304,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -338,7 +338,7 @@ define 
amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -368,7 +368,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -410,7 +410,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -444,7 +444,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -478,7 +478,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -508,7 +508,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: 
test_fmin3_olt_1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -612,7 +612,7 @@ entry: define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -646,7 +646,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -680,7 +680,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -714,7 +714,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -759,7 +759,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_1_f64: ; SI: ; 
%bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -793,7 +793,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -827,7 +827,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -861,7 +861,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll index 85653ded63ce6f..d20c39d5103649 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -26,7 +26,7 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_uge_f64: ; VI: ; 
%bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -57,7 +57,7 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -78,7 +78,7 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ugt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -109,7 +109,7 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ule_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -130,7 +130,7 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ule_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -161,7 +161,7 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; 
SI-LABEL: test_fmin_legacy_ult_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -182,7 +182,7 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ult_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -213,7 +213,7 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -234,7 +234,7 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_oge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -265,7 +265,7 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -286,7 +286,7 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ogt_f64: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -317,7 +317,7 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ole_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -338,7 +338,7 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ole_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -369,7 +369,7 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_olt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_olt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -390,7 +390,7 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_olt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll index 817e6dd87361ff..45f6bff10f45ee 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll @@ 
-262,8 +262,8 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fminimumi_f32_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS @@ -286,8 +286,8 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fminimum_f16_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index c60b9858abd836..7830c91851bfa7 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -15,7 +15,7 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 { ; VI-LABEL: multiple_fadd_use_test_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f32_e64 v0, s3, -1.0 ; VI-NEXT: v_add_f32_e64 v1, s2, -1.0 @@ -31,7 +31,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: multiple_fadd_use_test_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v0, s3, -1.0 @@ -46,7 +46,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_fadd_use_test_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v0, s3, -1.0 @@ -79,20 +79,20 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, float %x, [8 x i32], float %y) #0 { ; VI-LABEL: multiple_use_fadd_fmac_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-NEXT: s_load_dword s3, s[6:7], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-NEXT: s_load_dword s3, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s2, s0, 4 -; VI-NEXT: v_add_f32_e64 v2, s4, s4 +; VI-NEXT: v_add_f32_e64 v2, s6, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mac_f32_e64 v3, s4, 2.0 +; VI-NEXT: v_mac_f32_e64 v3, s6, 2.0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -101,9 +101,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX10-LABEL: multiple_use_fadd_fmac_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v1, s2, s2 @@ -117,13 +117,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX11-LABEL: multiple_use_fadd_fmac_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v1, s4, s4 -; GFX11-NEXT: v_fma_f32 v2, s4, 2.0, s5 +; GFX11-NEXT: v_add_f32_e64 v1, s2, s2 +; GFX11-NEXT: v_fma_f32 v2, s2, 2.0, s3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc @@ -142,7 +142,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: multiple_use_fadd_fmad_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s4, s0, 4 @@ -161,7 +161,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: multiple_use_fadd_fmad_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v1, |s2|, |s2| @@ -174,7 +174,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_use_fadd_fmad_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v1, |s2|, |s2| @@ -198,21 +198,21 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 { ; VI-LABEL: multiple_use_fadd_multi_fmad_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s6, s4, 4 +; VI-NEXT: s_add_u32 s4, s6, 4 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mad_f32 v2, |s0|, 2.0, v0 ; VI-NEXT: v_mad_f32 v3, |s0|, 2.0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_addc_u32 s7, s5, 0 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_addc_u32 s5, s7, 0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm @@ -220,23 +220,23 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; GFX10-LABEL: multiple_use_fadd_multi_fmad_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_fma_f32 v1, |s0|, 2.0, s1 ; GFX10-NEXT: v_fma_f32 v2, |s0|, 2.0, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: global_store_dword v0, v2, s[4:5] offset:4 +; GFX10-NEXT: global_store_dword v0, v2, s[6:7] offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: multiple_use_fadd_multi_fmad_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_fma_f32 v1, |s4|, 2.0, s5 @@ -261,8 +261,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: fmul_x2_xn2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -275,8 +275,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; GFX10-LABEL: fmul_x2_xn2_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f32_e64 v0, s2, -4.0 @@ -288,12 +288,12 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-LABEL: fmul_x2_xn2_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v0, s4, -4.0 +; 
GFX11-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -310,8 +310,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: fmul_x2_xn3_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 @@ -325,8 +325,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; GFX10-LABEL: fmul_x2_xn3_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2 @@ -338,12 +338,12 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-LABEL: fmul_x2_xn3_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s4 +; GFX11-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 ; 
GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -360,8 +360,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { ; VI-DENORM-LABEL: multiple_fadd_use_test_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16 ; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -378,8 +378,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_fadd_use_test_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 ; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -396,13 +396,13 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX10-DENORM-LABEL: multiple_fadd_use_test_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-DENORM-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX10-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX10-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-DENORM-NEXT: 
v_add_f16_e64 v0, |v0|, |v0| @@ -414,12 +414,12 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX10-FLUSH-LABEL: multiple_fadd_use_test_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-FLUSH-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX10-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |v0|, |v0| @@ -433,13 +433,14 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX11-DENORM-LABEL: multiple_fadd_use_test_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-DENORM-NEXT: s_clause 0x1 +; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s1, s0, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s3, -1.0 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX11-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -447,7 +448,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_mul_f16_e32 v1, v0, v0 ; 
GFX11-DENORM-NEXT: v_fma_f16 v0, -v1, v0, 1.0 -; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-DENORM-NEXT: s_nop 0 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -455,12 +455,13 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX11-FLUSH-LABEL: multiple_fadd_use_test_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-FLUSH-NEXT: s_clause 0x1 +; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s3, -1.0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX11-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -471,7 +472,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v0, 1.0, v0 -; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -496,14 +496,14 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: 
s_load_dword s6, s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 +; VI-DENORM-NEXT: s_lshr_b32 s3, s6, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 -; VI-DENORM-NEXT: v_fma_f16 v3, s4, 2.0, v0 +; VI-DENORM-NEXT: v_fma_f16 v3, s6, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, s4, s4 +; VI-DENORM-NEXT: v_add_f16_e64 v2, s6, s6 ; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 ; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 @@ -517,12 +517,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 +; VI-FLUSH-NEXT: s_lshr_b32 s3, s6, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, s4, s4 +; VI-FLUSH-NEXT: v_add_f16_e64 v2, s6, s6 ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 ; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s3 @@ -530,7 +530,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 -; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s4, 2.0 +; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s6, 2.0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) @@ -539,8 +539,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 -; 
GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 @@ -555,8 +555,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2 @@ -571,13 +571,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s4, s4 -; GFX11-DENORM-NEXT: v_fma_f16 v2, s4, 2.0, s2 +; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s2, s2 +; GFX11-DENORM-NEXT: v_fma_f16 v2, s2, 2.0, s3 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc @@ -589,12 +589,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; 
GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s4, s4 -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2 +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc @@ -617,14 +617,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 +; VI-DENORM-NEXT: s_lshr_b32 s3, s6, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 -; VI-DENORM-NEXT: v_fma_f16 v3, |s4|, 2.0, v0 +; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, |s4|, |s4| +; VI-DENORM-NEXT: v_add_f16_e64 v2, |s6|, |s6| ; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 ; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 @@ -638,14 +638,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 +; VI-FLUSH-NEXT: s_lshr_b32 s3, s6, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 -; VI-FLUSH-NEXT: v_mad_f16 v3, |s4|, 2.0, v0 +; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s4|, |s4| +; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s6|, |s6| ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 ; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0 @@ -660,8 +660,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 @@ -676,8 +676,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2| @@ -692,13 +692,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-DENORM-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, |s4|, |s4| -; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s2 +; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, |s2|, |s2| +; GFX11-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, s3 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc @@ -710,12 +710,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2| +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc @@ -739,9 +739,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; VI-DENORM-NEXT: s_load_dword s6, s[6:7], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x0 +; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 @@ -762,9 +762,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; ; VI-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; VI-FLUSH-NEXT: s_load_dword s6, s[6:7], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -786,14 +786,14 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX10-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x2 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s1 -; GFX10-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s0 +; GFX10-DENORM-NEXT: v_fma_f16 v2, |s6|, 2.0, s1 +; GFX10-DENORM-NEXT: v_fma_f16 v1, |s6|, 2.0, s0 ; GFX10-DENORM-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3] offset:2 @@ -803,12 +803,12 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX10-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX10-FLUSH: ; 
%bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x2 -; GFX10-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| +; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s6|, |s6| ; GFX10-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s0, v0 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, s1, v0 @@ -821,17 +821,17 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x2 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s1 -; GFX11-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s0 -; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[2:3] dlc +; GFX11-DENORM-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s3 +; GFX11-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s2 +; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3] offset:2 dlc +; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: s_nop 0 ; GFX11-DENORM-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) @@ -840,19 +840,19 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x2 -; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| -; GFX11-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s0, v0 -; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, s1, v0 -; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[2:3] dlc +; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 +; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, s3, v0 +; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[0:1] dlc ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[2:3] offset:2 dlc +; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -873,8 +873,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-LABEL: fmul_x2_xn2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 @@ -887,8 +887,8 
@@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX10-LABEL: fmul_x2_xn2_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f16_e64 v0, s2, -4.0 @@ -900,13 +900,13 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-LABEL: fmul_x2_xn2_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e64 v0, s4, -4.0 +; GFX11-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -925,8 +925,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-LABEL: fmul_x2_xn3_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc600 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 @@ -940,8 +940,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX10-LABEL: fmul_x2_xn3_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f16_e64 v0, 0xc600, s2 @@ -953,13 +953,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-LABEL: fmul_x2_xn3_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e64 v0, 0xc600, s4 +; GFX11-NEXT: v_mul_f16_e64 v0, 0xc600, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index 7c1c970b3fef78..98faaacf1dfb0a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -7,58 +7,58 @@ define amdgpu_kernel void @fmul_f16( ; SI-LABEL: fmul_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX89-LABEL: fmul_f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX89-NEXT: s_mov_b32 s11, 0xf000 -; GFX89-NEXT: s_mov_b32 s10, -1 -; GFX89-NEXT: s_mov_b32 s14, s10 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_mov_b32 s14, s2 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: s_mov_b32 s12, s6 ; GFX89-NEXT: s_mov_b32 s13, s7 -; GFX89-NEXT: s_mov_b32 s15, s11 -; GFX89-NEXT: s_mov_b32 s2, s10 -; GFX89-NEXT: s_mov_b32 s3, s11 +; GFX89-NEXT: s_mov_b32 s15, s3 +; GFX89-NEXT: s_mov_b32 s10, s2 +; GFX89-NEXT: s_mov_b32 s11, s3 ; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s4 -; GFX89-NEXT: s_mov_b32 s9, s5 +; GFX89-NEXT: s_mov_b32 s0, s4 +; GFX89-NEXT: s_mov_b32 s1, s5 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX89-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 
s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -93,7 +93,7 @@ entry: define amdgpu_kernel void @fmul_f16_imm_a( ; SI-LABEL: fmul_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -113,7 +113,7 @@ define amdgpu_kernel void @fmul_f16_imm_a( ; ; GFX89-LABEL: fmul_f16_imm_a: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -131,7 +131,7 @@ define amdgpu_kernel void @fmul_f16_imm_a( ; ; GFX11-LABEL: fmul_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -160,7 +160,7 @@ entry: define amdgpu_kernel void @fmul_f16_imm_b( ; SI-LABEL: fmul_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ define amdgpu_kernel void @fmul_f16_imm_b( ; ; GFX89-LABEL: fmul_f16_imm_b: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -198,7 +198,7 @@ define amdgpu_kernel void @fmul_f16_imm_b( ; ; GFX11-LABEL: fmul_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -227,21 +227,21 @@ entry: define amdgpu_kernel void @fmul_v2f16( ; SI-LABEL: fmul_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -256,60 +256,60 @@ define amdgpu_kernel void @fmul_v2f16( ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fmul_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, 
s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fmul_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -343,7 +343,7 @@ entry: define amdgpu_kernel void @fmul_v2f16_imm_a( ; SI-LABEL: fmul_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -369,7 +369,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_a( ; ; VI-LABEL: fmul_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -390,7 +390,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_a( ; ; GFX9-LABEL: fmul_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -409,7 +409,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_a( ; ; GFX11-LABEL: fmul_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -438,7 +438,7 @@ entry: define amdgpu_kernel void @fmul_v2f16_imm_b( ; SI-LABEL: fmul_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 
s10, s6 @@ -464,7 +464,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_b( ; ; VI-LABEL: fmul_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -485,7 +485,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_b( ; ; GFX9-LABEL: fmul_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -504,7 +504,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_b( ; ; GFX11-LABEL: fmul_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -533,21 +533,21 @@ entry: define amdgpu_kernel void @fmul_v4f16( ; SI-LABEL: fmul_v4f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s10 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 
s5, s9 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -574,26 +574,26 @@ define amdgpu_kernel void @fmul_v4f16( ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fmul_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f16_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v1, v3, v1 @@ -601,37 +601,37 @@ define amdgpu_kernel void @fmul_v4f16( ; VI-NEXT: v_mul_f16_e32 v0, v2, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v3 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fmul_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1 ; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -666,7 +666,7 @@ entry: define amdgpu_kernel void @fmul_v4f16_imm_a( ; SI-LABEL: fmul_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -701,7 +701,7 @@ define amdgpu_kernel void @fmul_v4f16_imm_a( ; ; VI-LABEL: fmul_v4f16_imm_a: 
; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -725,7 +725,7 @@ define amdgpu_kernel void @fmul_v4f16_imm_a( ; ; GFX9-LABEL: fmul_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -746,7 +746,7 @@ define amdgpu_kernel void @fmul_v4f16_imm_a( ; ; GFX11-LABEL: fmul_v4f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 9300dfcb16e8ad..718be90eb75fc3 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -22,7 +22,7 @@ declare half @llvm.fabs.f16(half) #1 define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmuladd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -42,7 +42,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-DENORM-LABEL: fmuladd_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 @@ -62,7 +62,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; 
GFX10-FLUSH-LABEL: fmuladd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -78,7 +78,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-DENORM-LABEL: fmuladd_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 @@ -92,7 +92,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-FLUSH-LABEL: fmuladd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -111,7 +111,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-DENORM-LABEL: fmuladd_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x2 @@ -136,7 +136,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmul_fadd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -156,7 +156,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, 
ptr addrspace(1) ; ; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -176,7 +176,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-FLUSH-LABEL: fmul_fadd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -192,7 +192,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: s_clause 0x2 @@ -208,7 +208,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: s_clause 0x2 @@ -222,7 +222,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-FLUSH-LABEL: fmul_fadd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -241,7 +241,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: s_clause 0x2 @@ -260,7 +260,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: s_clause 0x2 @@ -286,7 +286,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmul_fadd_contract_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -306,7 +306,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; VI-DENORM-LABEL: fmul_fadd_contract_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 @@ -326,7 +326,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16: ; GFX10-FLUSH: ; %bb.0: 
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -342,7 +342,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX10-DENORM-LABEL: fmul_fadd_contract_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 @@ -356,7 +356,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -375,7 +375,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX11-DENORM-LABEL: fmul_fadd_contract_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x2 @@ -401,7 +401,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -419,7 +419,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr 
addrspace(1) %out, ptr addrsp ; ; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -437,7 +437,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -451,7 +451,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -464,9 +464,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -483,9 +481,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, 
v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -513,7 +509,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -531,7 +527,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -549,7 +545,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -563,7 +559,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort 
v1, v0, s[0:1] glc dlc @@ -576,9 +572,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -595,9 +589,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -625,7 +617,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; VI-FLUSH-LABEL: fadd_a_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -643,7 +635,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -661,7 +653,7 @@ define 
amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-FLUSH-LABEL: fadd_a_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -675,7 +667,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -689,7 +681,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -702,9 +694,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-FLUSH-LABEL: fadd_a_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -721,9 +711,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16: ; GFX11-DENORM-STRICT: ; 
%bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -740,9 +728,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -773,7 +759,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; VI-FLUSH-LABEL: fadd_b_a_a_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -791,7 +777,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -809,7 +795,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-FLUSH-LABEL: 
fadd_b_a_a_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -823,7 +809,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-STRICT-LABEL: fadd_b_a_a_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -837,7 +823,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -850,9 +836,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-FLUSH-LABEL: fadd_b_a_a_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -869,9 +853,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; 
GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -888,9 +870,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -921,7 +901,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -939,7 +919,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -957,7 +937,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; 
GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -971,7 +951,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -984,9 +964,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1003,9 +981,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1033,7 +1009,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; 
VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1051,7 +1027,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; VI-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1069,7 +1045,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1083,7 +1059,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1096,9 +1072,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; 
GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1115,9 +1089,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1147,7 +1119,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1165,7 +1137,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1183,7 +1155,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: 
global_load_ushort v1, v0, s[0:1] glc dlc @@ -1197,7 +1169,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1210,9 +1182,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1229,9 +1199,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1261,7 +1229,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1279,7 +1247,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1297,7 +1265,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1311,7 +1279,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1324,9 +1292,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1343,9 +1309,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; 
GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1375,7 +1339,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1400,7 +1364,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1425,7 +1389,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-FLUSH-LABEL: mad_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1441,7 +1405,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16: ; 
GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1457,7 +1421,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1472,9 +1436,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-FLUSH-LABEL: mad_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1493,9 +1455,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1514,9 +1474,7 @@ define amdgpu_kernel void 
@mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1550,7 +1508,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1575,7 +1533,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1600,7 +1558,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: mad_sub_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1616,7 +1574,7 
@@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1632,7 +1590,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1647,9 +1605,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-FLUSH-LABEL: mad_sub_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1668,9 +1624,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; 
GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1689,9 +1643,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1725,7 +1677,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1750,7 +1702,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1775,7 +1727,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1791,7 +1743,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1807,7 +1759,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1822,9 +1774,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1843,9 +1793,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; 
GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1864,9 +1812,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1901,7 +1847,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1926,7 +1872,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1951,7 +1897,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) 
noalias nocaptu ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1967,7 +1913,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1983,7 +1929,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1998,9 +1944,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2019,9 +1963,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; 
GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2040,9 +1982,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2077,7 +2017,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: neg_neg_mad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -2102,7 +2042,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 
1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -2127,7 +2067,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: neg_neg_mad_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2143,7 +2083,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2159,7 +2099,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2174,9 +2114,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-FLUSH-LABEL: neg_neg_mad_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2195,9 +2133,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2216,9 +2152,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2254,7 +2188,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_fabs_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -2279,7 +2213,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; 
VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -2304,7 +2238,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2320,7 +2254,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2336,7 +2270,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2351,9 +2285,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2372,9 +2304,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2393,9 +2323,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2430,7 +2358,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -2448,7 +2376,7 @@ define amdgpu_kernel void 
@fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -2466,7 +2394,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2480,7 +2408,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2494,7 +2422,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2507,9 +2435,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; 
GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2526,9 +2452,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2545,9 +2469,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2577,7 +2499,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 
1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -2595,7 +2517,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -2613,7 +2535,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2627,7 +2549,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2641,7 +2563,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2654,9 +2576,7 
@@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2673,9 +2593,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2692,9 +2610,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index ba8b6fb80518fc..f411a76e75ab69 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -15,8 +15,8 @@ declare <4 x double> 
@llvm.nearbyint.v4f64(<4 x double>) #0 define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; SI-LABEL: fnearbyint_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,24 +28,23 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; ; CI-LABEL: fnearbyint_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0xb -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_rndne_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: fnearbyint_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f16_e32 v2, s4 +; VI-NEXT: v_rndne_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -54,11 +53,11 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; GFX11-LABEL: fnearbyint_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
v_rndne_f16_e32 v1, s4 +; GFX11-NEXT: v_rndne_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -71,8 +70,8 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; SICI-LABEL: fnearbyint_f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dword s4, s[2:3], 0xb -; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SICI-NEXT: s_load_dword s4, s[0:1], 0xb +; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SICI-NEXT: s_mov_b32 s3, 0xf000 ; SICI-NEXT: s_mov_b32 s2, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -82,10 +81,10 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; ; VI-LABEL: fnearbyint_f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f32_e32 v2, s4 +; VI-NEXT: v_rndne_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -94,11 +93,11 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; GFX11-LABEL: fnearbyint_f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f32_e32 v1, s4 +; GFX11-NEXT: v_rndne_f32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -112,7 +111,7 @@ entry: define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %in) #1 { ; SICI-LABEL: 
fnearbyint_v2f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SICI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SICI-NEXT: s_mov_b32 s7, 0xf000 ; SICI-NEXT: s_mov_b32 s6, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -125,7 +124,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fnearbyint_v2f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_rndne_f32_e32 v1, s3 @@ -136,7 +135,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; GFX11-LABEL: fnearbyint_v2f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f32_e32 v1, s3 @@ -154,8 +153,8 @@ entry: define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %in) #1 { ; SICI-LABEL: fnearbyint_v4f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SICI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SICI-NEXT: s_mov_b32 s3, 0xf000 ; SICI-NEXT: s_mov_b32 s2, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -168,8 +167,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> % ; ; VI-LABEL: fnearbyint_v4f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -183,8 +182,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> % ; GFX11-LABEL: 
fnearbyint_v4f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f32_e32 v3, s7 @@ -204,7 +203,7 @@ entry: define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; SI-LABEL: nearbyint_f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_brev_b32 s8, -2 @@ -228,7 +227,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; CI-LABEL: nearbyint_f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; CI-NEXT: s_mov_b32 s3, 0xf000 @@ -238,7 +237,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; VI-LABEL: nearbyint_f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -248,7 +247,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; GFX11-LABEL: nearbyint_f64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] @@ -264,41 +263,41 @@ entry: define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: nearbyint_v2f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; 
SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_brev_b32 s10, -2 ; SI-NEXT: v_mov_b32_e32 v6, 0x43300000 ; SI-NEXT: s_mov_b32 s9, 0x432fffff ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v7, s3 ; SI-NEXT: v_bfi_b32 v1, s10, v6, v7 -; SI-NEXT: v_mov_b32_e32 v8, s6 -; SI-NEXT: v_mov_b32_e32 v9, s5 -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1] +; SI-NEXT: v_mov_b32_e32 v8, s2 +; SI-NEXT: v_mov_b32_e32 v9, s1 +; SI-NEXT: v_mov_b32_e32 v10, s0 +; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[0:1] ; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1] ; SI-NEXT: v_bfi_b32 v1, s10, v6, v9 -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1] +; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[0:1] ; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: nearbyint_v2f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -309,8 +308,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; ; VI-LABEL: nearbyint_v2f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] ; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] @@ -322,8 +321,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; GFX11-LABEL: nearbyint_v2f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] @@ -341,8 +340,8 @@ entry: define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: nearbyint_v4f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_brev_b32 s14, -2 @@ -391,8 +390,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; CI-LABEL: nearbyint_v4f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -406,8 +405,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; VI-LABEL: nearbyint_v4f64: ; 
VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] ; VI-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] @@ -426,8 +425,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; GFX11-LABEL: nearbyint_v4f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 74e2b9ea714258..b5440b9c38c9f2 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -2799,7 +2799,7 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -2813,7 +2813,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fneg_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3016,41 +3016,41 @@ define float 
@v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s4, 0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_select_infloop_regression_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s4, 0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] -; VI-NEXT: s_cselect_b32 s0, 0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_cselect_b32 s2, 0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 
0, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %i = select i1 %arg1, double 0.0, double %arg @@ -3080,11 +3080,11 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_bitcmp1_b32 s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_bitcmp1_b32 s2, 16 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3] @@ -3096,11 +3096,11 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a ; ; VI-LABEL: s_fneg_select_infloop_regression_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s4, 16 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_bitcmp1_b32 s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 @@ -3146,7 +3146,7 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s1, 1, s1 ; SI-NEXT: s_cselect_b32 s0, 0, s0 @@ -3161,7 +3161,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, 1, s1 ; VI-NEXT: s_cselect_b32 s0, 0, s0 @@ -3216,8 +3216,8 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s6, 0 @@ -3235,8 +3235,8 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s6, 0 @@ -3279,7 +3279,7 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fabs_select_infloop_regression_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 
@@ -3293,7 +3293,7 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fabs_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3329,7 +3329,7 @@ define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_fabs_select_infloop_regression: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -3343,7 +3343,7 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 ; ; VI-LABEL: s_fneg_fabs_select_infloop_regression: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 8267bb9f5450f8..4364b32e62f8c9 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -7,12 +7,12 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) { ; CI-LABEL: fneg_fabs_fadd_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) 
@@ -23,8 +23,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; ; VI-LABEL: fneg_fabs_fadd_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -36,8 +36,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; ; GFX9-LABEL: fneg_fabs_fadd_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -49,13 +49,13 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-LABEL: fneg_fabs_fadd_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_sub_f16_e64 v1, s2, |s4| +; GFX11-NEXT: v_sub_f16_e64 v1, s3, |s2| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -70,13 +70,13 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, half %y) { ; CI-LABEL: fneg_fabs_fmul_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s1, s0, 0x7fff ; 
CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|s1| -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -87,8 +87,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; ; VI-LABEL: fneg_fabs_fmul_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -100,8 +100,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; ; GFX9-LABEL: fneg_fabs_fmul_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -113,13 +113,13 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-LABEL: fneg_fabs_fmul_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mul_f16_e64 v1, s2, -|s4| +; GFX11-NEXT: v_mul_f16_e64 v1, s3, -|s2| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -137,8 +137,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr 
addrspace(1) %out, half %x, ha define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI-LABEL: fneg_fabs_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -149,8 +149,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; VI-LABEL: fneg_fabs_free_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -161,8 +161,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; GFX9-LABEL: fneg_fabs_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bitset1_b32 s2, 15 @@ -173,10 +173,10 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: fneg_fabs_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -193,8 +193,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, 
i16 %in) { define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI-LABEL: fneg_fabs_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -205,8 +205,8 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: fneg_fabs_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -217,8 +217,8 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; GFX9-LABEL: fneg_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bitset1_b32 s2, 15 @@ -229,10 +229,10 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: fneg_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -248,7 +248,7 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { define amdgpu_kernel void @v_fneg_fabs_f16(ptr 
addrspace(1) %out, ptr addrspace(1) %in) { ; CIVI-LABEL: v_fneg_fabs_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -262,7 +262,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_fneg_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -273,7 +273,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_fneg_fabs_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -293,12 +293,12 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_add_f32_e32 v1, 2.0, v1 ; CI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -314,8 +314,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; ; VI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 @@ -331,8 +331,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; ; GFX9-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003c00 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -344,11 +344,11 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; GFX11-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s4 +; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -367,8 +367,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fneg_fabs_v2f16_bc_src: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -379,8 +379,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: s_fneg_fabs_v2f16_bc_src: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s2, 0x80008000 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -391,8 +391,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_fneg_fabs_v2f16_bc_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -403,10 +403,10 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; GFX11-LABEL: s_fneg_fabs_v2f16_bc_src: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x80008000 +; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -422,7 +422,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CIVI-LABEL: fneg_fabs_v4f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -435,7 +435,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; ; GFX9-LABEL: fneg_fabs_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s3, s3, 0x80008000 @@ -447,7 +447,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; ; GFX11-LABEL: fneg_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_or_b32 s3, s3, 0x80008000 @@ -467,12 +467,12 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: fold_user_fneg_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1 ; CI-NEXT: v_mul_f32_e32 v0, -4.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -487,8 +487,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fold_user_fneg_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 @@ -503,8 +503,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: fold_user_fneg_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -515,11 +515,11 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; GFX11-LABEL: fold_user_fneg_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -536,8 +536,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { ; CI-LABEL: s_fneg_multi_use_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -553,8 +553,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; ; VI-LABEL: s_fneg_multi_use_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -570,11 +570,11 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; ; GFX9-LABEL: s_fneg_multi_use_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; 
GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff +; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff ; GFX9-NEXT: s_xor_b32 s5, s4, 0x80008000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -585,8 +585,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; GFX11-LABEL: s_fneg_multi_use_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -609,8 +609,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { ; CI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010 @@ -633,8 +633,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; ; VI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v5, 0xc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -654,11 +654,11 @@ define amdgpu_kernel void 
@s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; ; GFX9-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff +; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_pk_mul_f16 v1, s4, -4.0 op_sel_hi:[1,0] ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] @@ -668,8 +668,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; GFX11-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll index d0115523b18823..2c9042ec17da88 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, double %y) { ; SI-LABEL: fneg_fabs_fadd_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -20,8 +20,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, ; ; VI-LABEL: fneg_fabs_fadd_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -40,7 +40,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrspace(1) %xptr, ptr addrspace(1) %yptr) { ; SI-LABEL: v_fneg_fabs_fadd_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -52,7 +52,7 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: v_fneg_fabs_fadd_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -73,8 +73,8 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, double %y) { ; SI-LABEL: fneg_fabs_fmul_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -88,8 +88,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, ; ; VI-LABEL: fneg_fabs_fmul_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: 
v_mov_b32_e32 v1, s7 @@ -108,7 +108,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: fneg_fabs_free_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s3, 31 @@ -122,7 +122,7 @@ define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: fneg_fabs_free_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_or_b32 s0, s3, 0x80000000 @@ -174,8 +174,8 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %in) { ; SI-LABEL: fneg_fabs_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s5, 31 @@ -187,14 +187,14 @@ define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], doubl ; ; VI-LABEL: fneg_fabs_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bitset1_b32 s3, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 
v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %in) @@ -206,8 +206,8 @@ define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], doubl define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: fneg_fabs_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s7, 31 @@ -222,8 +222,8 @@ define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> % ; ; VI-LABEL: fneg_fabs_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s7, 0x80000000 ; VI-NEXT: s_or_b32 s3, s5, 0x80000000 @@ -244,8 +244,8 @@ define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> % define amdgpu_kernel void @fneg_fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: fneg_fabs_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -267,8 +267,8 @@ define amdgpu_kernel void @fneg_fabs_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; VI-LABEL: fneg_fabs_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s7, 
31 ; VI-NEXT: s_bitset1_b32 s5, 31 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index 6446145bbfe2ad..3c000d4fa63a38 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, float %y) { ; SI-LABEL: fneg_fabsf_fadd_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fadd_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_sub_f32_e64 v2, s3, |v0| @@ -36,7 +36,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, float %y) { ; SI-LABEL: fneg_fabsf_fmul_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fmul_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mul_f32_e64 v2, s3, -|v0| @@ -67,11 +67,11 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fneg_fabsf_free_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_or_b32 s4, s2, 0x80000000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -79,10 +79,10 @@ define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fneg_fabsf_free_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -129,11 +129,11 @@ define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fneg_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_or_b32 s4, s2, 0x80000000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -141,10 +141,10 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fneg_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: 
v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -159,7 +159,7 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_fneg_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -177,7 +177,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: v_fneg_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fneg_fabsf_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s3, 31 @@ -213,7 +213,7 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fneg_fabsf_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s3, 31 ; VI-NEXT: s_bitset1_b32 s2, 31 @@ -232,8 +232,8 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-LABEL: fneg_fabsf_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s7, 31 @@ -250,8 +250,8 @@ define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> % ; ; VI-LABEL: fneg_fabsf_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s7, 0x80000000 ; VI-NEXT: s_or_b32 s3, s6, 0x80000000 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 63ccaafeda88f4..cd1ec85eb8d0f3 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1475,11 +1475,11 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_fo define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i1 %z, ptr addrspace(1) %dst) { ; GFX7-LABEL: multiple_uses_fneg_select_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s8, s[6:7], 0x4 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x6 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x4 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bitcmp1_b32 s8, 0 +; GFX7-NEXT: s_bitcmp1_b32 s6, 0 ; GFX7-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX7-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX7-NEXT: v_mov_b32_e32 v0, s3 @@ -1497,12 +1497,12 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; ; GFX9-LABEL: multiple_uses_fneg_select_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s8, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x18 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x18 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bitcmp1_b32 s8, 0 +; GFX9-NEXT: s_bitcmp1_b32 s6, 0 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -1519,13 +1519,13 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX11-LABEL: multiple_uses_fneg_select_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s8, s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x18 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x18 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s5 -; GFX11-NEXT: s_bitcmp1_b32 s8, 0 +; GFX11-NEXT: s_bitcmp1_b32 s2, 0 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, s7, v0, vcc_lo @@ -1549,7 +1549,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) { ; GCN-LABEL: fnge_select_f32_multi_use_regression: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -1562,7 +1562,7 @@ define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) { ; ; GFX11-LABEL: fnge_select_f32_multi_use_regression: ; GFX11: ; %bb.0: ; %.entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, s0, 0 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index 40982347f3ca00..31c1389c940208 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; CI-LABEL: s_fneg_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -20,8 +20,8 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; ; GFX8-LABEL: s_fneg_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -32,8 +32,8 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; ; GFX9-LABEL: s_fneg_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 @@ -44,10 +44,10 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; GFX11-LABEL: s_fneg_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_xor_b32 s2, s2, 
0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -64,7 +64,7 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -78,7 +78,7 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX8-LABEL: v_fneg_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -92,7 +92,7 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_fneg_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -103,9 +103,7 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX11-LABEL: v_fneg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -127,8 +125,8 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; CI-LABEL: s_fneg_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, 
s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -139,8 +137,8 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; ; GFX8-LABEL: s_fneg_free_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -151,8 +149,8 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; ; GFX9-LABEL: s_fneg_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 @@ -163,10 +161,10 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; GFX11-LABEL: s_fneg_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -182,7 +180,7 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_fold_f16: ; CI: ; %bb.0: -; CI-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -199,7 +197,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_fneg_fold_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -213,7 +211,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_fneg_fold_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -224,7 +222,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_fneg_fold_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -244,8 +242,8 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: s_fneg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -256,8 +254,8 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; ; GFX8-LABEL: s_fneg_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -268,8 +266,8 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; ; GFX9-LABEL: s_fneg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 @@ -280,10 +278,10 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; GFX11-LABEL: s_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -298,7 +296,7 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; CIVI-LABEL: s_fneg_v2f16_nonload: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CIVI-NEXT: ;;#ASMSTART ; CIVI-NEXT: ; def s2 ; CIVI-NEXT: ;;#ASMEND @@ -312,7 +310,7 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; ; GFX9-LABEL: s_fneg_v2f16_nonload: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s2 ; GFX9-NEXT: ;;#ASMEND @@ -325,7 
+323,7 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; ; GFX11-LABEL: s_fneg_v2f16_nonload: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s2 ; GFX11-NEXT: ;;#ASMEND @@ -347,7 +345,7 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -361,7 +359,7 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: v_fneg_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -375,7 +373,7 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_fneg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -386,9 +384,7 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_fneg_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -410,8 +406,8 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr 
addrspace(1) define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI-LABEL: fneg_free_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -422,8 +418,8 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX8-LABEL: fneg_free_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -434,8 +430,8 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX9-LABEL: fneg_free_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 @@ -446,10 +442,10 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX11-LABEL: fneg_free_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -465,7 +461,7 @@ define amdgpu_kernel void 
@fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -491,7 +487,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: v_fneg_fold_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -507,7 +503,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_fneg_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -518,7 +514,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: v_fneg_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -538,7 +534,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fneg_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -559,7 +555,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; 
GFX8-LABEL: v_extract_fneg_fold_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -576,7 +572,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX9-LABEL: v_extract_fneg_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -592,7 +588,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX11-LABEL: v_extract_fneg_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] @@ -623,7 +619,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 { ; CIVI-LABEL: v_extract_fneg_no_fold_v2f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -639,7 +635,7 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX9-LABEL: v_extract_fneg_no_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] @@ -653,7 +649,7 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX11-LABEL: v_extract_fneg_no_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 
s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index e447429539e6ff..d78bdfe08772a4 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: s_fneg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,10 +19,10 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: s_fneg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -32,10 +32,10 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; GFX11-LABEL: s_fneg_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -50,7 +50,7 @@ define amdgpu_kernel void @s_fneg_f32(ptr 
addrspace(1) %out, float %in) { define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x float> %in) { ; SI-LABEL: s_fneg_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -65,7 +65,7 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; ; VI-LABEL: s_fneg_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 ; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 @@ -78,7 +78,7 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; ; GFX11-LABEL: s_fneg_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 @@ -97,8 +97,8 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x float> %in) { ; SI-LABEL: s_fneg_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -115,8 +115,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl ; ; VI-LABEL: s_fneg_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s2, s7, 0x80000000 ; VI-NEXT: 
s_xor_b32 s3, s6, 0x80000000 @@ -134,8 +134,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl ; GFX11-LABEL: s_fneg_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s2, s7, 0x80000000 ; GFX11-NEXT: s_xor_b32 s3, s6, 0x80000000 @@ -157,8 +157,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fsub0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -168,10 +168,10 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fsub0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f32_e64 v2, 0, s4 +; VI-NEXT: v_sub_f32_e64 v2, 0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -180,11 +180,11 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: fsub0_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f32_e64 v1, 0, s4 +; GFX11-NEXT: v_sub_f32_e64 v1, 
0, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -198,8 +198,8 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fneg_free_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -210,10 +210,10 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fneg_free_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -223,10 +223,10 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: fneg_free_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -242,8 +242,8 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fneg_fold_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword 
s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -253,10 +253,10 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fneg_fold_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, -s4, s4 +; VI-NEXT: v_mul_f32_e64 v2, -s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -265,11 +265,11 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; GFX11-LABEL: fneg_fold_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v1, -s4, s4 +; GFX11-NEXT: v_mul_f32_e64 v1, -s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -284,8 +284,8 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: bitpreserve_fneg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -295,10 +295,10 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float 
%in ; ; VI-LABEL: bitpreserve_fneg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, s4, -4.0 +; VI-NEXT: v_mul_f32_e64 v2, s2, -4.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -307,11 +307,11 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in ; GFX11-LABEL: bitpreserve_fneg_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v1, s4, -4.0 +; GFX11-NEXT: v_mul_f32_e64 v1, s2, -4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -327,8 +327,8 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fneg_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -339,10 +339,10 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_fneg_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; 
VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -352,10 +352,10 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: s_fneg_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -380,8 +380,8 @@ define i32 @v_fneg_i32(i32 %in) { define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fneg_i32_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -391,10 +391,10 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_fneg_i32_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f32_e64 v2, 2.0, s4 +; VI-NEXT: v_sub_f32_e64 v2, 2.0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -403,11 +403,11 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: s_fneg_i32_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 
s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s4 +; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -434,7 +434,7 @@ define float @v_fneg_i32_fp_use(i32 %in) { define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_fneg_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -448,7 +448,7 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: s_fneg_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_xor_b32 s0, s3, 0x80000000 @@ -460,7 +460,7 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX11-LABEL: s_fneg_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -488,7 +488,7 @@ define i64 @v_fneg_i64(i64 %in) { define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_fneg_i64_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -500,7 +500,7 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: s_fneg_i64_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -510,7 +510,7 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; ; GFX11-LABEL: s_fneg_i64_fp_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 @@ -550,24 +550,23 @@ define i16 @v_fneg_i16(i16 %in) { define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; SI-LABEL: s_fneg_i16_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_i16_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f16_e64 v2, 2.0, s4 +; VI-NEXT: v_sub_f16_e64 v2, 2.0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -576,11 +575,11 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: s_fneg_i16_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; 
GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s4 +; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -620,8 +619,8 @@ define half @v_fneg_i16_fp_use(i16 %in) { define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; SI-LABEL: s_fneg_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -632,15 +631,15 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; ; VI-LABEL: s_fneg_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 -; VI-NEXT: s_xor_b32 s3, s4, 0x8000 +; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_xor_b32 s3, s3, 0x8000 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -650,10 +649,10 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; GFX11-LABEL: s_fneg_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 +; GFX11-NEXT: 
s_xor_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -696,35 +695,34 @@ define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) { define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) { ; SI-LABEL: s_fneg_v2i16_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s1, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_v2i16_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: s_xor_b32 s3, s3, 0x8000 ; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_xor_b32 s3, s4, 0x8000 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_add_f16_e64 v1, s3, 2.0 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_f16_e64 v1, s2, 2.0 ; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: 
v_mov_b32_e32 v0, s0 @@ -735,11 +733,11 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) ; GFX11-LABEL: s_fneg_v2i16_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v1, s4, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index 65046681ffc208..37a201e390f81f 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -8,123 +8,13 @@ declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i3 declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32) declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) -declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) -declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) -define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1 -; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 -; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret void -} - -define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1 -; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 -; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret void -} - -define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_rtn: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret <2 x half> %ret -} - -define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_rtn: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 -; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -133,7 +23,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 @@ -172,7 +62,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -181,7 +71,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 @@ -220,7 +110,7 @@ define <2 
x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] @@ -230,7 +120,7 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr ; ; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index cdfc8f48349f62..0746b93546124c 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -14,17 +14,17 @@ declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> % define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; 
GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -37,7 +37,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -49,7 +49,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_mov_b32 s0, 0 @@ -76,7 +76,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -88,7 +88,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_mov_b32 s0, 0 @@ -180,17 +180,17 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 
x half> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -225,17 +225,17 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -270,17 +270,17 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX940-LABEL: 
global_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[2:3] ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] @@ -316,7 +316,7 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -326,7 +326,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; ; GFX12-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: ds_pk_add_f16 v0, v1 @@ -364,7 +364,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -374,7 +374,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; ; GFX12-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: ds_pk_add_bf16 v0, v1 @@ -409,4 +409,298 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ret <2 x i16> %ret } +define float @flat_atomic_fadd_f32_intrinsic_ret__posoffset(ptr %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %ptr, i64 1023 + %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) + ret float %result +} + +define float @flat_atomic_fadd_f32_intrinsic_ret__negoffset(ptr %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; 
GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %ptr, i64 -256 + %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) + ret float %result +} + +define void @flat_atomic_fadd_f32_intrinsic_noret__posoffset(ptr %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %ptr, i64 1023 + %unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) + ret void +} + +define void @flat_atomic_fadd_f32_intrinsic_noret__negoffset(ptr %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; 
GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-1024 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %ptr, i64 -256 + %unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) + ret void +} + +define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__posoffset(ptr %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 + %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) + ret <2 x half> %result +} + +define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__negoffset(ptr %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; 
GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 + %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) + ret <2 x half> %result +} + +define void @flat_atomic_fadd_v2f16_intrinsic_noret__posoffset(ptr %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 + %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) + ret void +} + +define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-1024 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 + %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) + ret void +} + +define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset(ptr %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 + %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) + ret <2 x i16> %result +} + +define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset(ptr %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 + %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) + ret <2 x i16> %result +} + +define void @flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset(ptr %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 + %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) + ret void +} + +define void @flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset(ptr %ptr, <2 x i16> %data) 
{ +; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-1024 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 + %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) + ret void +} + attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index fb731cc00d3f01..18d2e52e8f9002 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -9,24 +9,24 @@ declare double @llvm.fabs.f64(double) #1 define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isinf_pattern: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x204 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword 
v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isinf_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -36,11 +36,11 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f ; GFX11-LABEL: test_isinf_pattern: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x204 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -57,24 +57,24 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_not_isinf_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_nlg_f32_e64 s[4:5], |s4|, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 +; SI-NEXT: v_cmp_nlg_f32_e64 s[0:1], |s0|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_not_isinf_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s4|, v0 +; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -84,11 +84,11 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % ; GFX11-LABEL: test_not_isinf_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s4| +; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s2| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -105,7 +105,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_not_isinf_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -115,7 +115,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % ; ; VI-LABEL: test_not_isinf_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -125,7 +125,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % ; ; GFX11-LABEL: test_not_isinf_pattern_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -142,24 +142,24 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -169,11 +169,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) 
nocapture %o ; GFX11-LABEL: test_isfinite_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -192,24 +192,24 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -219,11 +219,11 @@ define amdgpu_kernel void 
@test_isfinite_pattern_1(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -241,8 +241,8 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -253,10 +253,10 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4 +; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -266,11 +266,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 
0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s4 +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -290,23 +290,23 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_o_f32_e64 s[4:5], s6, s6 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_not_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4 @@ -321,14 +321,14 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_1: ; GFX11: ; 
%bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s4 -; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, s4 +; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -346,7 +346,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocapture %out, float %x, float %y) #0 { ; SI-LABEL: test_isfinite_not_pattern_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 @@ -362,7 +362,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2 @@ -376,7 +376,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; ; GFX11-LABEL: test_isfinite_not_pattern_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 @@ -401,23 +401,23 @@ define amdgpu_kernel void 
@test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_u_f32_e64 s[4:5], s6, s6 -; SI-NEXT: v_cmp_neq_f32_e64 s[6:7], |s6|, v0 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cmp_u_f32_e64 s[0:1], s2, s2 +; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s2|, v0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_not_pattern_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s4, s4 @@ -432,14 +432,14 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_3: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f32_e64 s2, s4, s4 -; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s4| +; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, 
s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -457,24 +457,24 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -484,11 +484,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: 
s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -507,24 +507,24 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_commute_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_4_commute_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -534,11 +534,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) ; GFX11-LABEL: test_isfinite_pattern_4_commute_and: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: 
s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -557,16 +557,16 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrspace(1) nocapture %out, float %x, [8 x i32], float %y) #0 { ; SI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: s_load_dword s1, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_cmp_o_f32_e32 vcc, s1, v1 -; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s1, v0 +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_cmp_o_f32_e32 vcc, s0, v1 +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s0, v0 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -574,14 +574,14 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; ; VI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x50 -; VI-NEXT: s_load_dword s1, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x50 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: 
v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_cmp_class_f32_e32 vcc, s1, v0 -; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s1, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s5, v0 +; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s5, v1 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] @@ -592,15 +592,15 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; GFX11-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x50 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x50 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s3, s4, 0x1f8 -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s5 +; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -618,8 +618,8 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isinf_pattern_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -632,11 +632,11 @@ define amdgpu_kernel void 
@test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; ; VI-LABEL: test_isinf_pattern_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -646,11 +646,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; GFX11-LABEL: test_isinf_pattern_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x204 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -667,8 +667,8 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -684,11 +684,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_pattern_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -698,11 +698,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_pattern_0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -721,8 +721,8 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -738,11 +738,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_pattern_4_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, 
s4, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -752,11 +752,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_pattern_4_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll index 105d9246880a49..587340c7aa342c 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -20,8 +20,8 @@ declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32 define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -30,8 +30,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,19 +41,19 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -63,8 +63,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -75,8 +75,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX12-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX12: ; 
%bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen @@ -86,8 +86,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -96,8 +96,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -107,19 +107,19 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: 
buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -129,8 +129,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; G_GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -242,15 +242,14 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xf ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; SI-NEXT: s_load_dword s0, s[2:3], 0xf -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt vmcnt(0) ; 
SI-NEXT: ds_write_b32 v1, v0 @@ -258,48 +257,60 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b32 v1, v0 ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_clause 0x2 +; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX1030-NEXT: v_mov_b32_e32 v1, s6 +; GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ds_write_b32 v1, v0 ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_clause 0x2 +; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc -; GFX1100-NEXT: v_mov_b32_e32 v1, s6 +; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: ds_store_b32 v1, v0 ; GFX1100-NEXT: s_endpgm @@ -307,8 +318,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_mov_b32 s4, 4 @@ -320,15 +331,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: 
raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 +; G_SI-NEXT: v_mov_b32_e32 v0, s2 +; G_SI-NEXT: v_mov_b32_e32 v1, s3 ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_SI-NEXT: s_load_dword s0, s[2:3], 0xf -; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt vmcnt(0) ; G_SI-NEXT: ds_write_b32 v1, v0 @@ -336,15 +346,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX7-NEXT: s_load_dword s0, s[2:3], 0xf -; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt vmcnt(0) ; G_GFX7-NEXT: ds_write_b32 v1, v0 @@ -353,12 +362,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: s_load_dword s0, s[2:3], 0x3c +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -368,15 +377,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_clause 0x2 +; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX1030-NEXT: s_load_dword s0, s[2:3], 0x3c +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: ds_write_b32 v1, v0 @@ -384,14 +392,13 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_clause 0x2 +; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 
+; G_GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; G_GFX1100-NEXT: s_load_b32 s0, s[2:3], 0x3c +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc -; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: ds_store_b32 v1, v0 @@ -405,8 +412,8 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -415,8 +422,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -426,19 +433,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -448,8 +455,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -460,8 +467,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX12-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen @@ -471,8 +478,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -481,8 +488,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -492,19 +499,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -514,8 +521,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; G_GFX1100-LABEL: 
raw_buffer_atomic_max_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -627,7 +634,7 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -642,7 +649,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -657,7 +664,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -669,7 +676,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; 
GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -681,7 +688,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc @@ -694,7 +701,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_mov_b32 s4, 4 @@ -708,7 +715,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -722,7 +729,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -736,7 +743,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], 
s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -748,7 +755,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -760,7 +767,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll index e124aadf4e8c23..e3ed0fa4918845 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll @@ -18,8 +18,8 @@ declare float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float, ptr addrspace(8 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,8 +28,8 @@ define 
amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -39,19 +39,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -61,8 +61,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -72,8 +72,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -82,8 +82,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -93,19 +93,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 
0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -115,8 +115,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -219,15 +219,14 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xf ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; SI-NEXT: s_load_dword s0, s[2:3], 0xf -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 @@ -235,63 +234,74 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; 
GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b32 v1, v0 ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_clause 0x2 +; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: buffer_atomic_fmin v0, v1, 
s[0:3], 4 offen glc slc -; GFX1030-NEXT: v_mov_b32_e32 v1, s6 +; GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ds_write_b32 v1, v0 ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_clause 0x2 +; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc -; GFX1100-NEXT: v_mov_b32_e32 v1, s6 +; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: ds_store_b32 v1, v0 ; GFX1100-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 +; G_SI-NEXT: v_mov_b32_e32 v0, s2 +; G_SI-NEXT: v_mov_b32_e32 v1, s3 ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_SI-NEXT: s_load_dword s0, s[2:3], 0xf -; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt vmcnt(0) ; G_SI-NEXT: ds_write_b32 v1, v0 @@ -299,15 +309,14 @@ define amdgpu_kernel void 
@raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX7-NEXT: s_load_dword s0, s[2:3], 0xf -; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt vmcnt(0) ; G_GFX7-NEXT: ds_write_b32 v1, v0 @@ -316,12 +325,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: s_load_dword s0, s[2:3], 0x3c +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -331,15 +340,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_clause 0x2 +; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX1030-NEXT: s_load_dword s0, s[2:3], 0x3c +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: ds_write_b32 v1, v0 @@ -347,14 +355,13 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_clause 0x2 +; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; G_GFX1100-NEXT: s_load_b32 s0, s[2:3], 0x3c +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc -; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: ds_store_b32 v1, v0 @@ -369,8 +376,8 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -379,8 +386,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -390,19 +397,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -412,8 +419,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1100: ; %bb.0: ; 
%main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -423,8 +430,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -433,8 +440,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -444,19 +451,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 
; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -466,8 +473,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -570,7 +577,7 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -585,7 +592,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -600,7 
+607,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -612,7 +619,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -624,7 +631,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc @@ -637,7 +644,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -651,7 +658,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; 
G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -665,7 +672,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -677,7 +684,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -689,7 +696,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll index 81859dce04889d..d827ea0503a3b2 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -10,7 +10,7 @@ declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp16_to_fp32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp16_to_fp32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll index c17be87834aeb7..03b8251ea4640d 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -8,7 +8,7 @@ declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp16_to_fp64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp16_to_fp64: ; 
GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll index d8a726f251a01e..8ab82b722445e0 100644 --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -9,7 +9,7 @@ declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp32_to_fp16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp32_to_fp16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -45,7 +45,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp32_to_fp16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index ce1fcccf4a17c8..5690b99e43ece4 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -26,22 +26,22 @@ declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, define 
amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -73,12 +73,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -88,12 +88,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -109,22 +109,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: 
buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -156,12 +156,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -171,12 +171,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; 
GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -192,22 +192,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 
idxen ; GFX940-NEXT: s_endpgm @@ -239,12 +239,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -254,12 +254,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -275,22 +275,22 @@ main_body: define amdgpu_kernel void 
@struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -322,12 +322,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; 
GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -337,12 +337,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -358,22 +358,22 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; 
GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -405,12 +405,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -420,12 +420,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: 
s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -441,22 +441,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: 
buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -488,12 +488,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -503,12 +503,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -524,22 +524,22 @@ main_body: define amdgpu_kernel void 
@struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -571,12 +571,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -586,12 +586,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -607,22 +607,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: 
v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -654,12 +654,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -669,12 +669,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -690,22 +690,22 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, 
s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -737,12 +737,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -752,12 +752,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -773,22 +773,22 @@ main_body: define amdgpu_kernel void 
@raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -820,12 +820,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -835,12 +835,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -856,22 +856,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 
; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -903,12 +903,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -918,12 +918,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, 
s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -939,22 +939,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: 
buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -986,12 +986,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -1001,12 +1001,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -1022,7 +1022,7 @@ main_body: 
define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s2 @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s2 @@ -1047,7 +1047,7 @@ main_body: define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s2 @@ -1057,7 +1057,7 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s2 @@ -1072,7 +1072,7 @@ main_body: define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 
v0, s2 @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s2 @@ -1097,23 +1097,23 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB39_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -1132,21 +1132,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr 
addrspace(1) %pt ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB39_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB39_2: @@ -1159,20 +1159,20 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB40_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: 
s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB40_2: @@ -1180,21 +1180,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB40_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB40_2: @@ -1207,23 +1207,23 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: 
s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB41_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -1242,21 +1242,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB41_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, 
s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB41_2: @@ -1269,20 +1269,20 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB42_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB42_2: @@ -1290,21 +1290,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: 
s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB42_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB42_2: @@ -1479,23 +1479,23 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB49_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: 
s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -1512,21 +1512,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB49_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB49_2: @@ -1539,7 +1539,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1564,7 +1564,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1581,7 +1581,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1593,7 +1593,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1610,7 +1610,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1636,7 +1636,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1760,7 +1760,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 @@ -1771,7 +1771,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -1806,7 +1806,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1829,7 +1829,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1846,7 +1846,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: 
flat_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 @@ -1857,7 +1857,7 @@ define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -1892,7 +1892,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 @@ -1903,7 +1903,7 @@ define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -1938,16 +1938,16 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) { ; GFX90A-LABEL: local_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB63_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] @@ -1959,16 +1959,16 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do ; ; GFX940-LABEL: local_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB63_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; GFX940-NEXT: s_load_dword s6, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX940-NEXT: s_load_dword s6, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] @@ -2008,21 +2008,21 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 
v2, s4 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_endpgm main_body: @@ -2056,19 +2056,19 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB67_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB67_2: @@ -2076,19 +2076,19 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; ; GFX940-LABEL: 
local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB67_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB67_2: @@ -2101,19 +2101,19 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB68_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: 
v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB68_2: @@ -2121,19 +2121,19 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB68_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB68_2: @@ -2146,19 +2146,19 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB69_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 
s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB69_2: @@ -2166,19 +2166,19 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB69_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB69_2: @@ -2256,6 +2256,264 @@ main_body: ret double %ret } +define double @flat_atomic_fadd_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__posoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 511 + %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) + ret double %ret +} + +define double @flat_atomic_fadd_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 -511 + %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data) + ret double %ret +} + +define void @flat_atomic_fadd_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_noret__posoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 511 + %unused = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) + ret void +} + +define void @flat_atomic_fadd_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_noret__negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 -511 + %unused = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data) + ret void +} + +define double @flat_atomic_fmin_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__posoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 511 + %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) + ret double %ret +} + +define double @flat_atomic_fmin_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 -511 + %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %gep, double %data) + ret double %ret +} + +define void @flat_atomic_fmin_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_noret__posoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 511 + %unused = call 
double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) + ret void +} + +define void @flat_atomic_fmin_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_noret__negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 -511 + %unused = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %gep, double %data) + ret void +} + +define double @flat_atomic_fmax_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__posoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 511 + %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) + ret double %ret +} + +define double 
@flat_atomic_fmax_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 -511 + %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %gep, double %data) + ret double %ret +} + +define void @flat_atomic_fmax_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_noret__posoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 511 + %unused = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) + ret void +} + +define void @flat_atomic_fmax_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: 
flat_atomic_fmax_f64_intrinsic_noret__negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 -511 + %unused = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %gep, double %data) + ret void +} + attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { "denormal-fp-math"="ieee,ieee" "amdgpu-unsafe-fp-atomics"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll index f18f5752269e00..d610091840b958 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll @@ -16,9 +16,9 @@ declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,9 +28,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,12 +41,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -54,9 +54,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -66,9 +66,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -78,9 +78,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -91,12 +91,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: 
v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -104,9 +104,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -253,9 +253,9 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -265,9 +265,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -278,12 +278,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x 
i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -291,9 +291,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -303,9 +303,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -315,9 +315,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x 
i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -328,12 +328,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -341,9 +341,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ 
-424,7 +424,7 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -438,7 +438,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -452,7 +452,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -465,7 +465,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 
v0, s4 @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -506,7 +506,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -519,7 +519,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll index 6a2a8c3ce595d7..5f501fec24c2e4 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll @@ -16,9 +16,9 @@ declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,9 +28,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,12 +41,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -54,9 +54,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -66,9 +66,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -78,9 +78,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -91,12 +91,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: 
v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -104,9 +104,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -253,9 +253,9 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -265,9 +265,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -278,12 +278,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -291,9 +291,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -303,9 +303,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; 
G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -315,9 +315,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -328,12 +328,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -341,9 +341,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; G_GFX1030-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -438,7 +438,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -452,7 +452,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -465,7 +465,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body 
-; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -506,7 +506,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -519,7 +519,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index 3571f3545ad1a1..04ef30bd26aa51 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -8,8 +8,8 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,12 +19,12 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fp_to_sint_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -47,8 +47,8 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i32_fabs: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -58,12 +58,12 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) ; ; VI-LABEL: fp_to_sint_i32_fabs: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e64 v0, |s4| +; VI-NEXT: v_cvt_i32_f32_e64 v0, |s2| +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -87,7 +87,7 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: 
fp_to_sint_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -100,7 +100,7 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -132,7 +132,7 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: fp_to_sint_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -147,7 +147,7 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: fp_to_sint_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -193,37 +193,37 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s5, 0x2f800000 -; SI-NEXT: s_mov_b32 s6, 0xcf800000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s1, 0x2f800000 +; 
SI-NEXT: s_mov_b32 s2, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s4 -; SI-NEXT: v_mul_f32_e64 v1, |v0|, s5 +; SI-NEXT: v_trunc_f32_e32 v0, s0 +; SI-NEXT: v_mul_f32_e64 v1, |v0|, s1 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; SI-NEXT: v_floor_f32_e32 v1, v1 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v1 -; SI-NEXT: v_fma_f32 v0, v1, s6, |v0| +; SI-NEXT: v_fma_f32 v0, v1, s2, |v0| ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_xor_b32_e32 v1, v3, v2 ; SI-NEXT: v_xor_b32_e32 v0, v0, v2 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s2, 0x2f800000 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s4, 0x2f800000 ; VI-NEXT: s_mov_b32 s5, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s4 -; VI-NEXT: v_mul_f32_e64 v1, |v0|, s2 +; VI-NEXT: v_trunc_f32_e32 v0, s2 +; VI-NEXT: v_mul_f32_e64 v1, |v0|, s4 ; VI-NEXT: v_floor_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v2, v1, s5, |v0| ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -294,7 +294,7 @@ entry: define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_sint_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 @@ -329,7 +329,7 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: 
s_mov_b32 s8, 0x2f800000 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -452,17 +452,17 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %x) { ; SI-LABEL: fp_to_sint_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 ; SI-NEXT: s_mov_b32 s9, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s5 -; SI-NEXT: v_trunc_f32_e32 v1, s4 -; SI-NEXT: v_trunc_f32_e32 v2, s7 -; SI-NEXT: v_trunc_f32_e32 v3, s6 +; SI-NEXT: v_trunc_f32_e32 v0, s1 +; SI-NEXT: v_trunc_f32_e32 v1, s0 +; SI-NEXT: v_trunc_f32_e32 v2, s3 +; SI-NEXT: v_trunc_f32_e32 v3, s2 ; SI-NEXT: v_mul_f32_e64 v4, |v0|, s8 ; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; SI-NEXT: v_mul_f32_e64 v6, |v1|, s8 @@ -503,14 +503,14 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % ; SI-NEXT: v_subb_u32_e32 v7, vcc, v12, v9, vcc ; SI-NEXT: v_sub_i32_e32 v4, vcc, v13, v11 ; SI-NEXT: v_subb_u32_e32 v5, vcc, v8, v11, vcc -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s8, 0x2f800000 ; VI-NEXT: s_mov_b32 s9, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -737,8 +737,8 @@ 
define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -749,8 +749,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -787,8 +787,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -799,8 +799,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -838,8 +838,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa define amdgpu_kernel void 
@fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_sint_f32_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -849,12 +849,12 @@ define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in) ; ; VI-LABEL: fp_to_sint_f32_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll index c6b4e129bacbe2..5abf82aa1aab59 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -8,8 +8,8 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_uint_f32_to_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,12 +19,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float % ; ; VI-LABEL: fp_to_uint_f32_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; 
VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_u32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -47,7 +47,7 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float % define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fp_to_uint_v2f32_to_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -60,7 +60,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,7 +92,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: fp_to_uint_v4f32_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -107,7 +107,7 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr ; ; VI-LABEL: fp_to_uint_v4f32_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -152,34 +152,34 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void 
@fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x) { ; SI-LABEL: fp_to_uint_f32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s5, 0xcf800000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s1, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s4 +; SI-NEXT: v_trunc_f32_e32 v0, s0 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; SI-NEXT: v_floor_f32_e32 v2, v1 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v2 -; SI-NEXT: v_fma_f32 v0, v2, s5, v0 +; SI-NEXT: v_fma_f32 v0, v2, s1, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_f32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s2, 0xcf800000 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xcf800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s4 +; VI-NEXT: v_trunc_f32_e32 v0, s2 ; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; VI-NEXT: v_floor_f32_e32 v2, v1 -; VI-NEXT: v_fma_f32 v0, v2, s2, v0 +; VI-NEXT: v_fma_f32 v0, v2, s3, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -240,7 +240,7 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_uint_v2f32_to_v2i64: ; SI: ; %bb.0: -; 
SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s8, 0xcf800000 @@ -264,7 +264,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -376,16 +376,16 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x float> %x) { ; SI-LABEL: fp_to_uint_v4f32_to_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s8, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s5 -; SI-NEXT: v_trunc_f32_e32 v2, s4 -; SI-NEXT: v_trunc_f32_e32 v4, s7 -; SI-NEXT: v_trunc_f32_e32 v6, s6 +; SI-NEXT: v_trunc_f32_e32 v0, s1 +; SI-NEXT: v_trunc_f32_e32 v2, s0 +; SI-NEXT: v_trunc_f32_e32 v4, s3 +; SI-NEXT: v_trunc_f32_e32 v6, s2 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; SI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 @@ -406,14 +406,14 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x ; SI-NEXT: v_cvt_u32_f32_e32 v0, v8 ; SI-NEXT: v_cvt_u32_f32_e32 v6, v4 ; SI-NEXT: v_cvt_u32_f32_e32 v4, v9 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_v4f32_to_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s2, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -619,8 +619,8 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -631,8 +631,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -669,8 +669,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -681,8 +681,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; 
VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -720,8 +720,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -731,12 +731,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %i ; ; VI-LABEL: fp_to_uint_f32_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_u32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 8c6dc4395839c0..82c25c01b17792 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; SI-LABEL: fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; ; GFX89-LABEL: 
fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -43,7 +43,7 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; ; GFX11-LABEL: fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @fpext_f16_to_f64( ; SI-LABEL: fpext_f16_to_f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; ; GFX89-LABEL: fpext_f16_to_f64: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; ; GFX11-LABEL: fpext_f16_to_f64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -141,7 +141,7 @@ entry: define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; SI-LABEL: fpext_v2f16_to_v2f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -161,7 +161,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; ; GFX89-LABEL: fpext_v2f16_to_v2f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; ; GFX11-LABEL: fpext_v2f16_to_v2f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -212,7 +212,7 @@ entry: define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; SI-LABEL: fpext_v2f16_to_v2f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -234,7 +234,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; ; GFX89-LABEL: fpext_v2f16_to_v2f64: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -255,7 +255,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; ; GFX11-LABEL: fpext_v2f16_to_v2f64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -290,35 +290,46 @@ entry: define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) { ; SI-LABEL: s_fneg_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: 
v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: s_fneg_fpext_f16_to_f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: s_fneg_fpext_f16_to_f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_fpext_f16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_fneg_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -334,7 +345,7 @@ entry: define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; SI-LABEL: fneg_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry 
-; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -352,7 +363,7 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; ; GFX89-LABEL: fneg_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -370,7 +381,7 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; ; GFX11-LABEL: fneg_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -400,7 +411,7 @@ entry: define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; SI-LABEL: fabs_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -418,7 +429,7 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -436,7 +447,7 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -466,7 +477,7 @@ entry: define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; SI-LABEL: fneg_fabs_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; 
SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -484,7 +495,7 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; ; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -502,7 +513,7 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; ; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -535,7 +546,7 @@ entry: define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; SI-LABEL: fneg_multi_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -557,7 +568,7 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -579,7 +590,7 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -614,7 +625,7 @@ entry: define amdgpu_kernel void 
@fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -638,7 +649,7 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -660,7 +671,7 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -696,7 +707,7 @@ entry: define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; SI-LABEL: fabs_multi_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -718,7 +729,7 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -740,7 +751,7 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 
0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -775,7 +786,7 @@ entry: define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; SI-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -799,7 +810,7 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -821,7 +832,7 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -857,7 +868,7 @@ entry: define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; SI-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -879,7 +890,7 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -901,7 +912,7 @@ define amdgpu_kernel void 
@fabs_fneg_multi_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -937,7 +948,7 @@ entry: define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -961,7 +972,7 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -983,7 +994,7 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1020,6 +1031,3 @@ entry: declare half @llvm.fabs.f16(half) #1 attributes #1 = { nounwind readnone } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX9: {{.*}} -; VI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index 0e12cca1900ce6..238010ec05e4db 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; SI-LABEL: fptosi_f16_to_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; ; VI-LABEL: fptosi_f16_to_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -43,7 +43,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; ; GFX11-LABEL: fptosi_f16_to_i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @fptosi_f16_to_i32( ; SI-LABEL: fptosi_f16_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; ; VI-LABEL: fptosi_f16_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; ; GFX11-LABEL: fptosi_f16_to_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 
s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -144,7 +144,7 @@ entry: define amdgpu_kernel void @fptosi_f16_to_i64( ; SI-LABEL: fptosi_f16_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -164,7 +164,7 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; ; VI-LABEL: fptosi_f16_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -184,7 +184,7 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; ; GFX11-LABEL: fptosi_f16_to_i64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -216,7 +216,7 @@ entry: define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; SI-LABEL: fptosi_v2f16_to_v2i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -241,7 +241,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; ; VI-LABEL: fptosi_v2f16_to_v2i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -261,7 +261,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; ; GFX11-LABEL: fptosi_v2f16_to_v2i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: 
s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -296,7 +296,7 @@ entry: define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; SI-LABEL: fptosi_v2f16_to_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -318,7 +318,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; ; VI-LABEL: fptosi_v2f16_to_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -339,7 +339,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; ; GFX11-LABEL: fptosi_v2f16_to_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -377,7 +377,7 @@ entry: define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; SI-LABEL: fptosi_v2f16_to_v2i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -401,7 +401,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; ; VI-LABEL: fptosi_v2f16_to_v2i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -424,7 +424,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; ; GFX11-LABEL: fptosi_v2f16_to_v2i64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; 
GFX11-NEXT: s_mov_b32 s10, s6 @@ -462,8 +462,8 @@ entry: define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; SI-LABEL: fptosi_f16_to_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -475,8 +475,8 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: fptosi_f16_to_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -488,11 +488,11 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: fptosi_f16_to_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s4 +; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index abc5c7af13b0ce..1116dc9ae2e5b2 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; SI-LABEL: fptoui_f16_to_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 
s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; ; VI-LABEL: fptoui_f16_to_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -43,7 +43,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; ; GFX11-LABEL: fptoui_f16_to_i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @fptoui_f16_to_i32( ; SI-LABEL: fptoui_f16_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @fptoui_f16_to_i32( ; ; VI-LABEL: fptoui_f16_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @fptoui_f16_to_i32( ; ; GFX11-LABEL: fptoui_f16_to_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -144,7 +144,7 @@ entry: define amdgpu_kernel void @fptoui_f16_to_i64( ; SI-LABEL: fptoui_f16_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -164,7 +164,7 @@ define amdgpu_kernel void 
@fptoui_f16_to_i64( ; ; VI-LABEL: fptoui_f16_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -184,7 +184,7 @@ define amdgpu_kernel void @fptoui_f16_to_i64( ; ; GFX11-LABEL: fptoui_f16_to_i64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -216,7 +216,7 @@ entry: define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; SI-LABEL: fptoui_v2f16_to_v2i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -240,7 +240,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; ; VI-LABEL: fptoui_v2f16_to_v2i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -260,7 +260,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; ; GFX11-LABEL: fptoui_v2f16_to_v2i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -295,7 +295,7 @@ entry: define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; SI-LABEL: fptoui_v2f16_to_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -317,7 +317,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; ; VI-LABEL: fptoui_v2f16_to_v2i32: ; VI: ; %bb.0: 
; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -338,7 +338,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; ; GFX11-LABEL: fptoui_v2f16_to_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -376,7 +376,7 @@ entry: define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; SI-LABEL: fptoui_v2f16_to_v2i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -400,7 +400,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; ; VI-LABEL: fptoui_v2f16_to_v2i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -423,7 +423,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; ; GFX11-LABEL: fptoui_v2f16_to_v2i64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -460,22 +460,21 @@ entry: define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; SI-LABEL: fptoui_f16_to_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 1.0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fptoui_f16_to_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -487,11 +486,11 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: fptoui_f16_to_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s4 +; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index 65ac2e240469de..6cc7368eeae616 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -41,7 +41,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -59,7 +59,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -71,7 +71,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -89,7 +89,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -101,7 +101,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -121,7 +121,7 @@ define 
amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -144,7 +144,7 @@ entry: define amdgpu_kernel void @fptrunc_f64_to_f16( ; SI-SDAG-LABEL: fptrunc_f64_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -163,7 +163,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; SI-GISEL-LABEL: fptrunc_f64_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -176,7 +176,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; VI-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -195,7 +195,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; VI-GISEL-LABEL: fptrunc_f64_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -208,7 +208,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX9-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -227,7 +227,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX9-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -240,7 +240,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX11-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -262,7 +262,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX11-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -287,7 +287,7 @@ entry: define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; SI-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -308,7 +308,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; SI-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; 
SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -323,7 +323,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; VI-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -343,7 +343,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; VI-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -358,7 +358,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX9-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -378,7 +378,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX9-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -392,7 +392,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -415,7 +415,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX11-GISEL-LABEL: 
fptrunc_v2f32_to_v2f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -441,7 +441,7 @@ entry: define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -464,7 +464,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -481,7 +481,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -503,7 +503,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -519,7 +519,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -541,7 +541,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 @@ -557,7 +557,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -584,7 +584,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 @@ -613,7 +613,7 @@ entry: define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -631,7 +631,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: 
s_mov_b32 s2, -1 @@ -643,7 +643,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -661,7 +661,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -673,7 +673,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -691,7 +691,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX9-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -703,7 +703,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX11-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -723,7 +723,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: 
fneg_fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -747,7 +747,7 @@ entry: define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -765,7 +765,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -777,7 +777,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -795,7 +795,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -807,7 +807,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -825,7 +825,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX9-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -837,7 +837,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX11-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -857,7 +857,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -881,7 +881,7 @@ entry: define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -899,7 +899,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, 
s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -911,7 +911,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -929,7 +929,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -941,7 +941,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -959,7 +959,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX9-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -971,7 +971,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX11-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -991,7 +991,7 @@ define 
amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -1016,7 +1016,7 @@ entry: define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; SI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1034,7 +1034,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; SI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; VI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; VI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1076,7 +1076,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX9-SDAG-LABEL: 
fptrunc_f32_to_f16_zext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1106,7 +1106,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1128,7 +1128,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -1155,7 +1155,7 @@ entry: define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; SI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; SI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; SI-GISEL: ; 
%bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -1185,7 +1185,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; VI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1203,7 +1203,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; VI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1215,7 +1215,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX9-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1233,7 +1233,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX9-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1245,7 +1245,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX11-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry -; 
GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1267,7 +1267,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX11-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -1295,7 +1295,7 @@ entry: define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; SI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1314,7 +1314,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; SI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -1327,7 +1327,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; VI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; VI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1359,7 +1359,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1378,7 +1378,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1391,7 +1391,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1413,7 +1413,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 1ba5e8f916cbaa..e4aa4d1d3ddb55 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) 
%out, double %in) { ; SI-LABEL: fptrunc_f64_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; VI-SDAG-LABEL: fptrunc_f64_to_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -40,7 +40,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; VI-GISEL-LABEL: fptrunc_f64_to_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; VI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -50,7 +50,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX10-SDAG-LABEL: fptrunc_f64_to_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -60,7 +60,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX10-GISEL-LABEL: fptrunc_f64_to_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 @@ -70,7 +70,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX11-SDAG-LABEL: fptrunc_f64_to_f32: ; GFX11-SDAG: ; %bb.0: -; 
GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -82,7 +82,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX11-GISEL-LABEL: fptrunc_f64_to_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 @@ -99,7 +99,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) { ; SI-LABEL: fptrunc_f64_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -159,7 +159,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-SAFE-SDAG: ; %bb.0: -; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -218,7 +218,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; VI-SAFE-GISEL: ; %bb.0: -; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; VI-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8 @@ -270,7 +270,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; 
VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-UNSAFE-SDAG: ; %bb.0: -; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 @@ -281,7 +281,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; VI-UNSAFE-GISEL: ; %bb.0: -; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 @@ -292,7 +292,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX10-SAFE-SDAG: ; %bb.0: -; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff ; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 @@ -348,7 +348,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX10-SAFE-GISEL: ; %bb.0: -; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 @@ -400,7 +400,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX10-UNSAFE-SDAG: ; %bb.0: -; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-UNSAFE-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -411,7 +411,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX10-UNSAFE-GISEL: ; %bb.0: -; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 @@ -422,7 +422,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX11-SAFE-SDAG: ; %bb.0: -; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff ; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 @@ -489,7 +489,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX11-SAFE-GISEL: ; %bb.0: -; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 @@ -548,7 +548,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX11-UNSAFE-SDAG: ; %bb.0: -; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -562,7 +562,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double 
%in) ; ; GFX11-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX11-UNSAFE-GISEL: ; %bb.0: -; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 @@ -582,8 +582,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: fptrunc_v2f64_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -594,8 +594,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; ; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -606,8 +606,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; ; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -619,8 +619,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; GFX10-SDAG-LABEL: 
fptrunc_v2f64_to_v2f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -632,8 +632,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; GFX10-GISEL-LABEL: fptrunc_v2f64_to_v2f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -645,8 +645,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -660,8 +660,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 
0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -679,37 +679,37 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x double> %in) { ; SI-LABEL: fptrunc_v3f64_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; SI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; SI-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; SI-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-SDAG-LABEL: fptrunc_v3f64_to_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x54 -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; VI-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] ; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: 
buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: fptrunc_v3f64_to_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -721,25 +721,24 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do ; ; GFX10-SDAG-LABEL: fptrunc_v3f64_to_v3f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 +; GFX10-SDAG-NEXT: s_clause 0x2 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] ; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: fptrunc_v3f64_to_v3f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -751,17 +750,16 @@ define amdgpu_kernel void 
@fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do ; ; GFX11-SDAG-LABEL: fptrunc_v3f64_to_v3f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x44 +; GFX11-SDAG-NEXT: s_clause 0x2 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x54 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0 ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -770,8 +768,8 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do ; GFX11-GISEL-LABEL: fptrunc_v3f64_to_v3f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -790,8 +788,8 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: fptrunc_v4f64_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -804,8 +802,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; ; VI-SDAG-LABEL: fptrunc_v4f64_to_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -818,8 +816,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; ; VI-GISEL-LABEL: fptrunc_v4f64_to_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -833,8 +831,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; GFX10-SDAG-LABEL: fptrunc_v4f64_to_v4f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -848,8 +846,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; GFX10-GISEL-LABEL: fptrunc_v4f64_to_v4f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; 
GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -863,8 +861,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; GFX11-SDAG-LABEL: fptrunc_v4f64_to_v4f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -880,8 +878,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; GFX11-GISEL-LABEL: fptrunc_v4f64_to_v4f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -901,8 +899,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x double> %in) { ; SI-LABEL: fptrunc_v8f64_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -920,8 +918,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; ; VI-SDAG-LABEL: fptrunc_v8f64_to_v8f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx16 s[4:19], s[2:3], 
0x64 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -939,8 +937,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; ; VI-GISEL-LABEL: fptrunc_v8f64_to_v8f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -959,8 +957,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX10-SDAG-LABEL: fptrunc_v8f64_to_v8f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -979,8 +977,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX10-GISEL-LABEL: fptrunc_v8f64_to_v8f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -999,8 +997,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX11-SDAG-LABEL: 
fptrunc_v8f64_to_v8f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1022,8 +1020,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX11-GISEL-LABEL: fptrunc_v8f64_to_v8f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 7c5d73ab66b47a..0d59021b69019f 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -51,8 +51,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; CI-LABEL: frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ 
-92,8 +92,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 8 @@ -120,12 +120,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; GFX9-LABEL: frem_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 @@ -139,13 +139,13 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-LABEL: frem_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 @@ -159,8 +159,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-LABEL: frem_f16: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -184,8 +184,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-LABEL: frem_f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -218,8 +218,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -247,8 +247,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: fast_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -276,8 +276,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 8 @@ -299,12 +299,12 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: fast_frem_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v3, v2 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 @@ -316,13 +316,13 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-LABEL: fast_frem_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v3, v2 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 @@ -334,8 +334,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-LABEL: fast_frem_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -356,8 +356,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-LABEL: fast_frem_f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -387,8 +387,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: unsafe_frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -416,8 +416,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: unsafe_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -445,8 +445,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 8 @@ -468,12 +468,12 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) 
%out, ptr addrspace( ; ; GFX9-LABEL: unsafe_frem_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v3, v2 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 @@ -485,13 +485,13 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX10-LABEL: unsafe_frem_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v3, v2 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 @@ -503,8 +503,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-LABEL: unsafe_frem_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -525,8 +525,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-LABEL: unsafe_frem_f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 
0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -556,8 +556,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -592,8 +592,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; CI-LABEL: frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -628,8 +628,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -662,12 +662,12 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; GFX9-LABEL: frem_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 
; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v1 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1 @@ -690,13 +690,13 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-LABEL: frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v4, s0, v2, v2, v1 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 @@ -719,8 +719,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-LABEL: frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -756,8 +756,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-LABEL: frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 
; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -802,8 +802,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -827,8 +827,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: fast_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -852,8 +852,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -875,12 +875,12 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: fast_frem_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX9-NEXT: global_load_dword v2, v0, 
s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f32_e32 v3, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -892,13 +892,13 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-LABEL: fast_frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -910,8 +910,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-LABEL: fast_frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -932,8 +932,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-LABEL: fast_frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -963,8 +963,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) 
%in1, ; SI-LABEL: unsafe_frem_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -988,8 +988,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: unsafe_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -1013,8 +1013,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1036,12 +1036,12 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: unsafe_frem_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f32_e32 v3, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -1053,13 +1053,13 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; 
GFX10-LABEL: unsafe_frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -1071,8 +1071,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX11-LABEL: unsafe_frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1093,8 +1093,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-LABEL: unsafe_frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -1124,8 +1124,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1182,8 +1182,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; CI-LABEL: frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -1217,8 +1217,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -1248,12 +1248,12 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; GFX9-LABEL: frem_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -1275,13 +1275,13 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-LABEL: frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -1302,8 +1302,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-LABEL: frem_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1337,8 +1337,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-LABEL: frem_f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v12, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -1379,8 +1379,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1430,8 +1430,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr 
addrspace(1) ; ; CI-LABEL: fast_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -1461,8 +1461,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -1488,12 +1488,12 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: fast_frem_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1511,13 +1511,13 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-LABEL: fast_frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 
v[0:1], v10, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1535,8 +1535,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-LABEL: fast_frem_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1566,8 +1566,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-LABEL: fast_frem_f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v10, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -1604,8 +1604,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: unsafe_frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1655,8 +1655,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: unsafe_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -1686,8 +1686,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -1713,12 +1713,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: unsafe_frem_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1736,13 +1736,13 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX10-LABEL: unsafe_frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1760,8 +1760,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX11-LABEL: unsafe_frem_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1791,8 +1791,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-LABEL: unsafe_frem_f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v10, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -1829,8 +1829,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1892,8 +1892,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -1955,8 +1955,8 @@ define 
amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1995,12 +1995,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 @@ -2023,13 +2023,13 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 @@ -2052,8 +2052,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2090,8 +2090,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-LABEL: frem_v2f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -2139,8 +2139,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v4f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2242,8 +2242,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -2345,8 +2345,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 32 @@ -2405,12 +2405,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5 @@ -2448,13 +2448,13 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v4f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX10-NEXT: v_rcp_f32_e32 v5, v5 @@ -2492,8 +2492,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 
v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2552,8 +2552,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-LABEL: frem_v4f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -2625,8 +2625,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2676,8 +2676,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -2727,8 +2727,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 32 @@ -2776,12 +2776,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; ; GFX9-LABEL: frem_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 @@ -2819,13 +2819,13 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 @@ -2863,8 +2863,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2921,8 +2921,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
GFX1150-LABEL: frem_v2f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -2989,8 +2989,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3070,8 +3070,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -3151,8 +3151,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s0, 64 @@ -3230,12 +3230,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3 ; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 @@ -3303,13 +3303,13 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3 ; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 @@ -3377,8 +3377,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -3477,8 +3477,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-LABEL: frem_v4f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 
+; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v8, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -3589,8 +3589,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3681,8 +3681,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -3730,8 +3730,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s0, 64 @@ -3777,12 +3777,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v2f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] @@ -3818,13 +3818,13 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3] ; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] @@ -3858,8 +3858,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -3912,8 +3912,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-LABEL: frem_v2f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v16, 0 ; 
GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index ea588df86b8467..4ea3323a9dbfc7 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -13,49 +13,51 @@ declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind rea define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: s_lshr_b32 s5, s4, 1 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: s_not_b32 s4, s6 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_alignbit_b32 v0, s5, v0, v1 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_not_b32 s5, s8 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: v_alignbit_b32 v0, s6, v0, 1 +; SI-NEXT: s_lshr_b32 s4, s6, 1 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_not_b32 s3, s6 -; VI-NEXT: s_lshr_b32 s2, s4, 1 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_lshr_b32 s1, s6, 1 +; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1 
+; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_not_b32 s3, s6 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_not_b32 s1, s2 +; GFX9-NEXT: s_lshr_b32 s0, s6, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32: @@ -75,30 +77,30 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-LABEL: fshl_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s4, s5, 1 -; GFX10-NEXT: s_lshr_b32 s2, s4, 1 -; GFX10-NEXT: s_not_b32 s3, s6 -; GFX10-NEXT: v_alignbit_b32 v0, s2, v0, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, 1 +; GFX10-NEXT: s_lshr_b32 s0, s6, 1 +; GFX10-NEXT: s_not_b32 s1, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; 
GFX11-LABEL: fshl_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s4, s5, 1 -; GFX11-NEXT: s_lshr_b32 s2, s4, 1 -; GFX11-NEXT: s_not_b32 s3, s6 +; GFX11-NEXT: v_alignbit_b32 v0, s6, s7, 1 +; GFX11-NEXT: s_lshr_b32 s1, s6, 1 +; GFX11-NEXT: s_not_b32 s0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s2, v0, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_alignbit_b32 v0, s1, v0, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -111,7 +113,7 @@ entry: define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-LABEL: fshl_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -124,7 +126,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshl_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 @@ -135,7 +137,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -157,7 +159,7 @@ define amdgpu_kernel 
void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshl_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 @@ -166,7 +168,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 @@ -183,15 +185,15 @@ entry: define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: s_not_b32 s1, s1 ; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 +; SI-NEXT: s_not_b32 s1, s1 ; SI-NEXT: s_lshr_b32 s2, s5, 1 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_alignbit_b32 v1, s2, v0, v1 @@ -206,47 +208,47 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; 
VI-NEXT: s_not_b32 s1, s1 +; VI-NEXT: s_not_b32 s3, s3 ; VI-NEXT: s_lshr_b32 s7, s5, 1 ; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_not_b32 s2, s2 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: s_lshr_b32 s1, s4, 1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, v2 +; VI-NEXT: s_lshr_b32 s3, s4, 1 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_alignbit_b32 v0, s3, v0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s2, s5, 1 -; GFX9-NEXT: s_not_b32 s3, s9 +; GFX9-NEXT: s_lshr_b32 s0, s5, 1 +; GFX9-NEXT: s_not_b32 s1, s9 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_not_b32 s3, s8 +; GFX9-NEXT: s_not_b32 s1, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_alignbit_b32 v0, s2, v0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], 
s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: @@ -270,39 +272,39 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, s5, s7, 1 ; GFX10-NEXT: v_alignbit_b32 v3, s4, s6, 1 -; GFX10-NEXT: s_lshr_b32 s2, s5, 1 -; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: s_lshr_b32 s0, s5, 1 +; GFX10-NEXT: s_not_b32 s1, s3 ; GFX10-NEXT: s_lshr_b32 s3, s4, 1 -; GFX10-NEXT: s_not_b32 s0, s0 -; GFX10-NEXT: v_alignbit_b32 v1, s2, v0, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s3, v3, s0 +; GFX10-NEXT: s_not_b32 s2, s2 +; GFX10-NEXT: v_alignbit_b32 v1, s0, v0, s1 +; GFX10-NEXT: v_alignbit_b32 v0, s3, v3, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v0, s5, s7, 1 ; GFX11-NEXT: v_alignbit_b32 v3, s4, s6, 1 ; GFX11-NEXT: s_lshr_b32 s5, s5, 1 -; GFX11-NEXT: s_not_b32 s1, s1 +; GFX11-NEXT: s_not_b32 s3, s3 ; GFX11-NEXT: s_lshr_b32 s4, s4, 1 -; GFX11-NEXT: s_not_b32 s0, s0 -; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s1 -; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s0 -; GFX11-NEXT: 
global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: s_not_b32 s2, s2 +; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s3 +; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -315,8 +317,8 @@ entry: define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshl_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -329,8 +331,8 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; VI-LABEL: fshl_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -343,15 +345,15 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; GFX9-LABEL: fshl_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32_imm: @@ -371,20 +373,20 @@ define amdgpu_kernel void 
@fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshl_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 23 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 25 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 23 @@ -402,44 +404,44 @@ entry: define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshl_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_not_b32 s1, s19 ; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: s_not_b32 s11, s15 ; SI-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v3, s0, v0, v1 +; SI-NEXT: s_lshr_b32 s7, s7, 1 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_alignbit_b32 v3, s7, 
v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: s_not_b32 s1, s18 +; SI-NEXT: s_not_b32 s7, s14 ; SI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s6, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v2, s0, v0, v1 +; SI-NEXT: s_lshr_b32 s6, s6, 1 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_alignbit_b32 v2, s6, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: s_not_b32 s1, s17 +; SI-NEXT: s_not_b32 s6, s13 ; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s5, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v1, s0, v0, v1 +; SI-NEXT: s_lshr_b32 s5, s5, 1 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_not_b32 s1, s16 +; SI-NEXT: s_not_b32 s5, s12 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s4, 1 -; SI-NEXT: v_mov_b32_e32 v4, s1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: s_lshr_b32 s4, s4, 1 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: s_not_b32 s3, s15 @@ -472,36 +474,36 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: 
s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_not_b32 s3, s15 +; GFX9-NEXT: s_not_b32 s1, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: s_lshr_b32 s2, s7, 1 +; GFX9-NEXT: s_lshr_b32 s0, s7, 1 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v3, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v3, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: s_not_b32 s3, s14 +; GFX9-NEXT: s_not_b32 s1, s14 ; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v2, s2, v0, v1 +; GFX9-NEXT: s_lshr_b32 s0, s6, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: s_not_b32 s3, s13 +; GFX9-NEXT: s_not_b32 s1, s13 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s5, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 +; GFX9-NEXT: s_lshr_b32 s0, s5, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_not_b32 s3, s12 +; GFX9-NEXT: s_not_b32 s1, s12 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_alignbit_b32 v0, s2, v0, v5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v5 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32: @@ -532,11 +534,11 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX10-LABEL: fshl_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: 
s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1 ; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1 @@ -560,9 +562,9 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-LABEL: fshl_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[12:15], s[2:3], 0x54 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v0, s7, s11, 1 @@ -594,10 +596,10 @@ entry: define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: fshl_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s10 @@ -607,13 +609,13 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, 25 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 31 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; 
SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -630,9 +632,9 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; ; GFX9-LABEL: fshl_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 @@ -666,22 +668,22 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-LABEL: fshl_v4i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 31 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 23 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 25 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 31 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v4i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
v_alignbit_b32 v3, s7, s11, 31 @@ -702,7 +704,7 @@ entry: define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-LABEL: orxor2or1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -718,7 +720,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; VI-LABEL: orxor2or1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s4, s2, 7 ; VI-NEXT: s_or_b32 s4, s3, s4 @@ -732,7 +734,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX9-LABEL: orxor2or1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s4, s2, 7 @@ -759,7 +761,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX10-LABEL: orxor2or1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl_b32 s4, s2, 7 @@ -772,7 +774,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX11-LABEL: orxor2or1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s2, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index dbcebe6e07e3fe..e8310e73f9a475 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -22,40 +22,42 @@ 
declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_alignbit_b32 v0, s6, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_alignbit_b32 v2, s4, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_alignbit_b32 v2, s6, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 
+; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32: @@ -72,24 +74,24 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-LABEL: fshr_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s5, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s4, s5, v0 +; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -103,7 +105,7 @@ entry: define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-LABEL: fshr_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -116,7 +118,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr 
addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshr_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 @@ -127,7 +129,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -149,7 +151,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshr_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 @@ -158,7 +160,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 @@ -175,9 +177,9 @@ entry: define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -192,33 +194,33 @@ define amdgpu_kernel void @fshr_v2i32(ptr 
addrspace(1) %in, <2 x i32> %x, <2 x i ; ; VI-LABEL: fshr_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, v0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm @@ -240,13 +242,13 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[8:9], 
s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, v0 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, v2 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] @@ -255,16 +257,16 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-LABEL: fshr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -277,8 +279,8 @@ entry: define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshr_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: 
s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -291,8 +293,8 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; VI-LABEL: fshr_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -305,15 +307,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; GFX9-LABEL: fshr_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32_imm: @@ -333,20 +335,20 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshr_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 9 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 7 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 
0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9 @@ -364,11 +366,11 @@ entry: define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshr_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s19, 0xf000 -; SI-NEXT: s_mov_b32 s18, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s15 @@ -382,14 +384,14 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v4, s12 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s15 @@ -410,10 +412,10 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX9-LABEL: fshr_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; 
GFX9-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s15 @@ -451,9 +453,9 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: fshr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s15 @@ -464,15 +466,15 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, v1 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, v4 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[12:15], s[2:3], 0x54 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14 @@ -496,10 +498,10 @@ entry: define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: 
fshr_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s10 @@ -509,13 +511,13 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, 7 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -532,9 +534,9 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; ; GFX9-LABEL: fshr_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 @@ -566,22 +568,22 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-LABEL: fshr_v4i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; 
GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 1 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 9 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 7 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 1 diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll index 8fd201038ad160..8779bb0df0f711 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll @@ -3975,15 +3975,15 @@ define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) { define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %in) { ; SDAG-IEEE-LABEL: elim_redun_check_neg0: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dword s0, s[2:3], 0xb -; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s0, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; SDAG-IEEE-NEXT: 
v_sqrt_f32_e32 v1, v0 ; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 @@ -4005,17 +4005,18 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; ; GISEL-IEEE-LABEL: elim_redun_check_neg0: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dword s6, s[2:3], 0xb -; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s6 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 ; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 ; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 @@ -4031,25 +4032,24 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-IEEE-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GISEL-IEEE-NEXT: s_endpgm ; ; SDAG-DAZ-LABEL: elim_redun_check_neg0: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dword s0, s[2:3], 0xb +; SDAG-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; 
SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 -; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 @@ -4063,24 +4063,22 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 ; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SDAG-DAZ-NEXT: s_endpgm ; ; GISEL-DAZ-LABEL: elim_redun_check_neg0: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dword s4, s[2:3], 0xb -; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s4 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s4, v1 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; GISEL-DAZ-NEXT: 
v_mul_f32_e32 v1, 0.5, v1 ; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4095,8 +4093,9 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-DAZ-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 ; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-DAZ-NEXT: s_endpgm entry: @@ -4110,15 +4109,15 @@ entry: define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %in) { ; SDAG-IEEE-LABEL: elim_redun_check_pos0: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dword s0, s[2:3], 0xb -; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s0, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 ; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 @@ -4140,17 +4139,18 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; ; GISEL-IEEE-LABEL: elim_redun_check_pos0: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dword s6, s[2:3], 0xb -; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; 
GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s6 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 ; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 ; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 @@ -4165,25 +4165,24 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, s6, 0 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, s2, 0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GISEL-IEEE-NEXT: s_endpgm ; ; SDAG-DAZ-LABEL: elim_redun_check_pos0: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dword s0, s[2:3], 0xb +; SDAG-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 -; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; SDAG-DAZ-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 @@ -4197,24 +4196,22 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 ; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SDAG-DAZ-NEXT: s_endpgm ; ; GISEL-DAZ-LABEL: elim_redun_check_pos0: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dword s4, s[2:3], 0xb -; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s4 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s4, v1 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4228,8 +4225,9 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e64 vcc, s4, 0 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e64 vcc, s2, 0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-DAZ-NEXT: s_mov_b32 
s2, -1 ; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-DAZ-NEXT: s_endpgm entry: @@ -4243,15 +4241,15 @@ entry: define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in) { ; SDAG-IEEE-LABEL: elim_redun_check_ult: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dword s0, s[2:3], 0xb -; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s0, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 ; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 @@ -4273,17 +4271,18 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; ; GISEL-IEEE-LABEL: elim_redun_check_ult: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dword s6, s[2:3], 0xb -; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s6 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; 
GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 ; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 ; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 @@ -4299,25 +4298,24 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-IEEE-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-IEEE-NEXT: v_cmp_nge_f32_e32 vcc, s6, v1 +; GISEL-IEEE-NEXT: v_cmp_nge_f32_e32 vcc, s2, v1 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GISEL-IEEE-NEXT: s_endpgm ; ; SDAG-DAZ-LABEL: elim_redun_check_ult: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dword s0, s[2:3], 0xb +; SDAG-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 -; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 @@ -4331,24 +4329,22 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 ; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-DAZ-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; SDAG-DAZ-NEXT: s_endpgm ; ; GISEL-DAZ-LABEL: elim_redun_check_ult: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dword s4, s[2:3], 0xb -; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s4 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s4, v1 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4363,8 +4359,9 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-DAZ-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-DAZ-NEXT: v_cmp_nge_f32_e32 vcc, s4, v1 +; GISEL-DAZ-NEXT: v_cmp_nge_f32_e32 vcc, s2, v1 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 ; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-DAZ-NEXT: s_endpgm entry: @@ -4378,7 +4375,7 @@ entry: define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; 
SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 @@ -4426,7 +4423,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; ; GISEL-IEEE-LABEL: elim_redun_check_v2: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) @@ -4478,7 +4475,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; ; SDAG-DAZ-LABEL: elim_redun_check_v2: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-DAZ-NEXT: s_mov_b32 s7, 0xf000 @@ -4524,7 +4521,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; ; GISEL-DAZ-LABEL: elim_redun_check_v2: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) @@ -4582,7 +4579,7 @@ entry: define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2_ult: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 @@ -4630,7 +4627,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; ; GISEL-IEEE-LABEL: elim_redun_check_v2_ult: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) @@ -4682,7 +4679,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; ; SDAG-DAZ-LABEL: elim_redun_check_v2_ult: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-DAZ-NEXT: s_mov_b32 s7, 0xf000 @@ -4728,7 +4725,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; ; GISEL-DAZ-LABEL: elim_redun_check_v2_ult: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll index f6df1cbbdd06b8..f72d4e0e03633c 100644 --- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll @@ -7,58 +7,58 @@ define amdgpu_kernel void @fsub_f16( ; SI-LABEL: fsub_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, 
s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX89-LABEL: fsub_f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX89-NEXT: s_mov_b32 s11, 0xf000 -; GFX89-NEXT: s_mov_b32 s10, -1 -; GFX89-NEXT: s_mov_b32 s14, s10 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_mov_b32 s14, s2 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: s_mov_b32 s12, s6 ; GFX89-NEXT: s_mov_b32 s13, s7 -; GFX89-NEXT: s_mov_b32 s15, s11 -; GFX89-NEXT: s_mov_b32 s2, s10 -; GFX89-NEXT: s_mov_b32 s3, s11 +; GFX89-NEXT: s_mov_b32 s15, s3 +; GFX89-NEXT: s_mov_b32 s10, s2 +; GFX89-NEXT: s_mov_b32 s11, s3 ; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s4 -; GFX89-NEXT: s_mov_b32 s9, s5 +; GFX89-NEXT: s_mov_b32 s0, s4 +; GFX89-NEXT: s_mov_b32 s1, s5 ; GFX89-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX89-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -93,7 +93,7 @@ entry: define amdgpu_kernel void @fsub_f16_imm_a( ; SI-LABEL: fsub_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -113,7 +113,7 @@ define amdgpu_kernel void @fsub_f16_imm_a( ; ; GFX89-LABEL: fsub_f16_imm_a: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -131,7 +131,7 @@ define amdgpu_kernel void @fsub_f16_imm_a( ; ; GFX11-LABEL: fsub_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -160,7 +160,7 @@ entry: define amdgpu_kernel void @fsub_f16_imm_b( ; SI-LABEL: fsub_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ define amdgpu_kernel void @fsub_f16_imm_b( ; ; GFX89-LABEL: fsub_f16_imm_b: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -198,7 +198,7 @@ define amdgpu_kernel void @fsub_f16_imm_b( ; ; GFX11-LABEL: fsub_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -227,21 +227,21 @@ entry: define amdgpu_kernel void @fsub_v2f16( ; SI-LABEL: fsub_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -256,60 +256,60 @@ define amdgpu_kernel void @fsub_v2f16( ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fsub_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; 
VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_f16_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fsub_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -343,7 +343,7 @@ entry: define amdgpu_kernel void @fsub_v2f16_imm_a( ; SI-LABEL: fsub_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -369,7 +369,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_a( ; ; VI-LABEL: fsub_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -390,7 +390,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_a( ; ; GFX9-LABEL: fsub_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -409,7 +409,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_a( ; ; GFX11-LABEL: fsub_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -438,7 +438,7 @@ entry: define amdgpu_kernel void @fsub_v2f16_imm_b( ; SI-LABEL: fsub_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 
0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -464,7 +464,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_b( ; ; VI-LABEL: fsub_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -485,7 +485,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_b( ; ; GFX9-LABEL: fsub_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -504,7 +504,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_b( ; ; GFX11-LABEL: fsub_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 8846068e750d46..44a9127b4bd09c 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -6,7 +6,7 @@ define void @void_func_i1_inreg(i1 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i1_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s6, 1 +; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -28,7 +28,7 @@ define void @void_func_i8_inreg(i8 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i8_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -47,7 
+47,7 @@ define void @void_func_i16_inreg(i16 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -66,7 +66,7 @@ define void @void_func_i32_inreg(i32 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -85,8 +85,8 @@ define void @void_func_i64_inreg(i64 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -105,7 +105,7 @@ define void @void_func_f16_inreg(half inreg %arg0) #0 { ; GFX9-LABEL: void_func_f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -124,7 +124,7 @@ define void @void_func_f32_inreg(float inreg %arg0) #0 { ; GFX9-LABEL: void_func_f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -143,8 +143,8 @@ define void @void_func_f64_inreg(double inreg %arg0) #0 { ; GFX9-LABEL: void_func_f64_inreg: ; 
GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -163,7 +163,7 @@ define void @void_func_v2i16_inreg(<2 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -182,9 +182,9 @@ define void @void_func_v3i16_inreg(<3 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -214,8 +214,8 @@ define void @void_func_v4i16_inreg(<4 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -234,10 +234,10 @@ define void @void_func_v5i16_inreg(<5 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v5i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; 
GFX9-NEXT: global_store_short v[0:1], v0, off +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -259,10 +259,10 @@ define void @void_func_v8i16_inreg(<8 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -282,8 +282,8 @@ define void @void_func_v2i32_inreg(<2 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -302,9 +302,9 @@ define void @void_func_v3i32_inreg(<3 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: global_store_dwordx3 v[0:1], v[0:2], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -324,10 +324,10 @@ define void @void_func_v4i32_inreg(<4 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -347,12 +347,12 @@ define void @void_func_v5i32_inreg(<5 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v5i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -360,7 +360,7 @@ define void @void_func_v5i32_inreg(<5 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v5i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_clause 0x1 @@ -375,16 +375,16 @@ define void @void_func_v8i32_inreg(<8 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; 
GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -392,8 +392,8 @@ define void @void_func_v8i32_inreg(<8 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -437,28 +437,28 @@ define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], 
off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -466,12 +466,12 @@ define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GFX11-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 +; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 +; GFX11-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -488,33 +488,47 @@ define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 { ; 
GFX9-LABEL: void_func_v32i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s28 +; GFX9-NEXT: v_mov_b32_e32 v5, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -522,18 +536,24 @@ define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v32i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29 +; GFX11-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25 +; GFX11-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 -; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 -; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 -; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 -; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 -; GFX11-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 -; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17 +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: 
v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 +; GFX11-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 ; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 ; GFX11-NEXT: s_clause 0x4 @@ -551,10 +571,10 @@ define void @void_func_v2i64_inreg(<2 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -574,13 +594,13 @@ define void @void_func_v3i64_inreg(<3 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -588,7 +608,7 @@ define void @void_func_v3i64_inreg(<3 x i64> inreg %arg0) #0 { ; GFX11-LABEL: 
void_func_v3i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x1 @@ -603,16 +623,16 @@ define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -620,8 +640,8 @@ define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v4i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -636,20 +656,20 @@ define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 { 
; GFX9-LABEL: void_func_v5i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -657,11 +677,11 @@ define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v5i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 -; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off @@ -675,28 +695,28 @@ define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 { ; 
GFX9-LABEL: void_func_v8i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -704,12 +724,12 @@ define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: 
v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GFX11-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 +; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 +; GFX11-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -726,33 +746,47 @@ define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s28 +; GFX9-NEXT: v_mov_b32_e32 v5, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: 
v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -760,18 +794,24 @@ define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29 +; GFX11-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25 +; GFX11-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; 
GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 -; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 -; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 -; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 -; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 -; GFX11-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 -; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17 +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 +; GFX11-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 ; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 ; GFX11-NEXT: s_clause 0x4 @@ -789,7 +829,7 @@ define void @void_func_v2f16_inreg(<2 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -808,9 +848,9 @@ define void @void_func_v3f16_inreg(<3 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 
v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -831,8 +871,8 @@ define void @void_func_v4f16_inreg(<4 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -851,10 +891,10 @@ define void @void_func_v8f16_inreg(<8 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -874,16 +914,16 @@ define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 
v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -891,8 +931,8 @@ define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16f16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -907,8 +947,8 @@ define void @void_func_v2f32_inreg(<2 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -927,9 +967,9 @@ define void @void_func_v3f32_inreg(<3 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: global_store_dwordx3 v[0:1], v[0:2], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -949,10 +989,10 @@ define void @void_func_v4f32_inreg(<4 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; 
GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -972,16 +1012,16 @@ define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -989,8 +1029,8 @@ define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1005,28 +1045,28 @@ define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 { ; GFX9-LABEL: 
void_func_v16f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1034,12 +1074,12 @@ define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: 
v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GFX11-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 +; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 +; GFX11-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -1056,10 +1096,10 @@ define void @void_func_v2f64_inreg(<2 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1079,13 +1119,13 @@ define void @void_func_v3f64_inreg(<3 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: 
global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1093,7 +1133,7 @@ define void @void_func_v3f64_inreg(<3 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v3f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1108,16 +1148,16 @@ define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1125,8 +1165,8 @@ define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v4f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: 
v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1141,28 +1181,28 @@ define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1170,12 +1210,12 @@ define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GFX11-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 +; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 +; GFX11-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -1192,33 +1232,47 @@ define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s28 +; GFX9-NEXT: v_mov_b32_e32 v5, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s26 
-; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1226,18 +1280,24 @@ define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29 +; GFX11-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25 +; GFX11-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: 
v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 -; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 -; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 -; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 -; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 -; GFX11-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 -; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17 +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 +; GFX11-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 ; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 ; GFX11-NEXT: s_clause 0x4 @@ -1255,86 +1315,104 @@ define void @void_func_v32i32_i1_i8_i16_f32_inreg(<32 x i32> inreg %arg0, i1 inr ; GFX9-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GFX9-NEXT: v_mov_b32_e32 v13, v1 +; GFX9-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s29 +; GFX9-NEXT: global_store_dwordx4 v[0:1], 
v[2:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[10:13], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, 
s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v6 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_byte v[0:1], v17, off +; GFX9-NEXT: global_store_byte v[0:1], v7, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v[0:1], v18, off +; GFX9-NEXT: global_store_short v[0:1], v8, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v[0:1], v19, off +; GFX9-NEXT: global_store_short v[0:1], v9, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v17, s27 -; GFX11-NEXT: v_dual_mov_b32 v18, s28 :: v_dual_mov_b32 v19, s29 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29 +; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s20 :: v_dual_mov_b32 v15, s21 +; GFX11-NEXT: v_dual_mov_b32 v16, s22 :: v_dual_mov_b32 v17, s23 +; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc ; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[18:21], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 -; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 -; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 -; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc +; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13 +; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, s15 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9 +; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v13, s11 +; GFX11-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v15, s5 +; GFX11-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7 +; GFX11-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[18:21], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b8 v[0:1], v12, off dlc +; GFX11-NEXT: global_store_b8 
v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b8 v[0:1], v13, off dlc +; GFX11-NEXT: global_store_b8 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b16 v[0:1], v14, off dlc +; GFX11-NEXT: global_store_b16 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b16 v[0:1], v15, off dlc +; GFX11-NEXT: global_store_b16 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef @@ -1349,76 +1427,94 @@ define void @void_func_v32i32_v2i32_v2f32_inreg(<32 x i32> inreg %arg0, <2 x i32 ; GFX9-LABEL: void_func_v32i32_v2i32_v2f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GFX9-NEXT: v_mov_b32_e32 v13, v1 +; GFX9-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s29 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[10:13], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: 
v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[16:17], off +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[6:7], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[18:19], off +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v2i32_v2f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29 +; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: 
v_dual_mov_b32 v13, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s20 :: v_dual_mov_b32 v15, s21 +; GFX11-NEXT: v_dual_mov_b32 v16, s22 :: v_dual_mov_b32 v17, s23 +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 -; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 -; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 +; GFX11-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v7, s17 ; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 -; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 -; GFX11-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7 -; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17 -; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1 -; GFX11-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc +; GFX11-NEXT: v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s13 +; GFX11-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v13, s15 +; GFX11-NEXT: v_dual_mov_b32 v14, s8 :: v_dual_mov_b32 v15, s9 +; GFX11-NEXT: v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v17, s11 +; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off 
dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[18:21], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[20:23], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[22:25], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b64 v[0:1], v[12:13], off dlc +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b64 v[0:1], v[14:15], off dlc +; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef @@ -1431,147 +1527,156 @@ define void @too_many_args_use_workitem_id_x_inreg( ; GFX9-LABEL: too_many_args_use_workitem_id_x_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s6 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s7 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s16 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s17 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_mov_b32_e32 v16, s18 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s20 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s21 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s22 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s23 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s24 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s25 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s15 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s26 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s16 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s27 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s17 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s28 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s29 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s19 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: v_mov_b32_e32 v6, s21 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: v_mov_b32_e32 v6, s23 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v4, off +; GFX9-NEXT: v_mov_b32_e32 v6, s24 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v5, off +; GFX9-NEXT: v_mov_b32_e32 v6, s25 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, s26 ; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v7, off +; GFX9-NEXT: v_mov_b32_e32 v6, s27 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v8, off +; GFX9-NEXT: v_mov_b32_e32 v6, s28 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v9, off +; GFX9-NEXT: v_mov_b32_e32 v6, s29 +; GFX9-NEXT: global_store_dword 
v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v10, off +; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v11, off +; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v12, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v13, off +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v14, off +; GFX9-NEXT: global_store_dword v[0:1], v4, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v15, off +; GFX9-NEXT: global_store_dword v[0:1], v5, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: too_many_args_use_workitem_id_x_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 -; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 -; GFX11-NEXT: v_mov_b32_e32 v16, s6 -; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v13, s16 :: 
v_dual_mov_b32 v12, s7 -; GFX11-NEXT: v_dual_mov_b32 v15, s18 :: v_dual_mov_b32 v14, s17 -; GFX11-NEXT: v_mov_b32_e32 v16, s19 -; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc +; GFX11-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v2, s5 +; GFX11-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v4, s7 +; GFX11-NEXT: v_mov_b32_e32 v6, s9 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v15, s23 :: v_dual_mov_b32 v12, s20 -; GFX11-NEXT: v_dual_mov_b32 v13, s21 :: v_dual_mov_b32 v14, s22 -; GFX11-NEXT: v_mov_b32_e32 v16, s24 -; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc +; GFX11-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v2, s10 +; GFX11-NEXT: v_dual_mov_b32 v3, s11 :: v_dual_mov_b32 v4, s12 +; GFX11-NEXT: v_mov_b32_e32 v6, s14 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc +; GFX11-NEXT: 
global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v12, s25 -; GFX11-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27 -; GFX11-NEXT: v_mov_b32_e32 v16, s29 -; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc +; GFX11-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v3, s16 +; GFX11-NEXT: v_dual_mov_b32 v4, s17 :: v_dual_mov_b32 v5, s18 +; GFX11-NEXT: v_mov_b32_e32 v6, s19 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v3, s21 +; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 +; GFX11-NEXT: v_mov_b32_e32 v6, s24 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc @@ -1582,15 +1687,22 @@ define void @too_many_args_use_workitem_id_x_inreg( ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v7, off dlc +; 
GFX11-NEXT: v_dual_mov_b32 v5, s28 :: v_dual_mov_b32 v2, s25 +; GFX11-NEXT: v_dual_mov_b32 v3, s26 :: v_dual_mov_b32 v4, s27 +; GFX11-NEXT: v_mov_b32_e32 v6, s29 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v8, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v9, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v10, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v11, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] i32 inreg %arg0, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7, @@ -1643,10 +1755,10 @@ define void @void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inreg %arg ; GFX9-LABEL: void_func_i32_v2float_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1669,24 +1781,24 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX9-LABEL: caller_void_func_i32_v2float_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s17, s33 +; GFX9-NEXT: s_mov_b32 s7, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-NEXT: s_mov_b64 exec, s[8:9] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s17, 2 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s7, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s2, s16 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s2, s6 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -1703,19 +1815,19 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s16, -1 +; GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s16 +; GFX11-NEXT: s_mov_b32 exec_lo, s4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_getpc_b64 
s[16:17] -; GFX11-NEXT: s_add_u32 s16, s16, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s17, s17, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_add_u32 s4, s4, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s5, s5, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 ; GFX11-NEXT: v_writelane_b32 v40, s3, 2 -; GFX11-NEXT: s_load_b64 s[16:17], s[16:17], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -1762,7 +1874,7 @@ define void @void_func_bf16_inreg(bfloat inreg %arg0) #0 { ; GFX9-LABEL: void_func_bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1781,7 +1893,7 @@ define void @void_func_v2bf16_inreg(<2 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1800,9 +1912,9 @@ define void @void_func_v3bf16_inreg(<3 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 
; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1823,8 +1935,8 @@ define void @void_func_v4bf16_inreg(<4 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1843,10 +1955,10 @@ define void @void_func_v8bf16_inreg(<8 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1866,16 +1978,16 @@ define void @void_func_v16bf16_inreg(<16 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: 
v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1883,8 +1995,8 @@ define void @void_func_v16bf16_inreg(<16 x bfloat> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1899,10 +2011,10 @@ define void @void_func_2_i32_inreg(i32 inreg %arg0, i32 inreg %arg1, ptr addrspa ; GFX9-LABEL: void_func_2_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1925,10 +2037,10 @@ define void @void_func_2_i64_inreg(i64 inreg %arg0, i64 inreg %arg1, ptr addrspa ; GFX9-LABEL: void_func_2_i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -1954,13 +2066,13 @@ define void 
@void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg ; GFX9-LABEL: void_func_i64_inreg_i32_inreg_i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mov_b32_e32 v3, s18 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: global_store_dword v[0:1], v4, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -1971,7 +2083,7 @@ define void @void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s1 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v6, s2 ; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1990,19 +2102,19 @@ define void @void_func_5_i32_inreg(i32 inreg %arg0, i32 inreg %arg1, i32 inreg % ; GFX9-LABEL: void_func_5_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; 
GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2012,7 +2124,7 @@ define void @void_func_5_i32_inreg(i32 inreg %arg0, i32 inreg %arg1, i32 inreg % ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc @@ -2036,12 +2148,12 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr) ; GFX9-LABEL: void_func_a5i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:16 -; GFX9-NEXT: v_mov_b32_e32 v5, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2049,7 +2161,7 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr) ; GFX11-LABEL: void_func_a5i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v5, s3 +; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v5, s3 ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: s_clause 0x1 @@ -2064,93 +2176,6 @@ define void 
@void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr) declare void @extern() define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %ptr) { -; GFX9-LABEL: void_func_a13i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s27, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[28:29] -; GFX9-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:48 -; GFX9-NEXT: v_mov_b32_e32 v5, s25 -; GFX9-NEXT: v_mov_b32_e32 v4, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s22 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32 -; GFX9-NEXT: v_writelane_b32 v40, s27, 2 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, extern@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, extern@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; 
GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: void_func_a13i32_inreg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s23, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s24, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s24 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v3, s19 -; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v9, s17 -; GFX11-NEXT: s_getpc_b64 s[18:19] -; GFX11-NEXT: s_add_u32 s18, s18, extern@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s19, s19, extern@gotpcrel32@hi+12 -; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v7, s7 -; GFX11-NEXT: s_load_b64 s[16:17], s[18:19], 0x0 -; GFX11-NEXT: v_writelane_b32 v40, s23, 2 -; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v5, s21 -; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v13, s3 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1 -; GFX11-NEXT: v_mov_b32_e32 v10, s0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_store_b32 v[0:1], v14, off offset:48 -; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:32 -; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 -; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-NEXT: s_setpc_b64 s[30:31] store [13 x i32] %arg0, ptr addrspace(1) %ptr call void @extern() ret void @@ -2178,52 +2203,6 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; FIXME: Should still fail define void @void_func_a16i32_inreg__noimplicit([16 x i32] inreg %arg0, ptr addrspace(1) %ptr) { -; GFX9-LABEL: void_func_a16i32_inreg__noimplicit: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, s29 -; GFX9-NEXT: v_mov_b32_e32 v4, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:48 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s25 -; GFX9-NEXT: v_mov_b32_e32 v4, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s22 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: void_func_a16i32_inreg__noimplicit: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s25 :: v_dual_mov_b32 v4, s24 -; GFX11-NEXT: v_dual_mov_b32 v3, s23 :: v_dual_mov_b32 v2, s22 -; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20 -; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18 -; GFX11-NEXT: v_dual_mov_b32 v13, s17 :: v_dual_mov_b32 v12, s16 -; GFX11-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6 -; GFX11-NEXT: v_dual_mov_b32 v17, s3 :: v_dual_mov_b32 v16, s2 -; 
GFX11-NEXT: v_dual_mov_b32 v15, s1 :: v_dual_mov_b32 v14, s0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:48 -; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:32 -; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:16 -; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off -; GFX11-NEXT: s_setpc_b64 s[30:31] store [16 x i32] %arg0, ptr addrspace(1) %ptr ret void } diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll index 2491cc0d19d5a1..1853aa9303095e 100644 --- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll +++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @divergent_or3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_or3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -31,7 +31,7 @@ bb: define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_or3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16 @@ -61,7 +61,7 @@ bb: define amdgpu_kernel void @divergent_and3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_and3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -89,7 +89,7 @@ bb: define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_and3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1] @@ -122,7 +122,7 @@ bb: define amdgpu_kernel void @divergent_xor3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_xor3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -149,7 +149,7 @@ bb: define amdgpu_kernel void @divergent_xor3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_xor3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1] @@ -180,7 +180,7 @@ bb: define amdgpu_kernel void @uniform_or3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_or3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -205,7 +205,7 @@ bb: define amdgpu_kernel void @uniform_or3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_or3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -232,7 +232,7 @@ bb: define amdgpu_kernel void @uniform_and3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_and3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -257,7 +257,7 @@ bb: define 
amdgpu_kernel void @uniform_and3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_and3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -284,7 +284,7 @@ bb: define amdgpu_kernel void @uniform_xor3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_xor3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -309,7 +309,7 @@ bb: define amdgpu_kernel void @uniform_xor3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_xor3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll index 1feae4dae6a09e..1a9334706cb927 100644 --- a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll @@ -106,7 +106,7 @@ define amdgpu_kernel void @gds_global_align_plus_attr(ptr addrspace(1) %out) #0 define amdgpu_kernel void @gds_extern_align(ptr addrspace(1) %out, ptr addrspace(2) %gds.arg) #0 { ; GCN-LABEL: gds_extern_align: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x8 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x8 ; GCN-NEXT: v_mov_b32_e32 v0, 5 ; GCN-NEXT: s_movk_i32 m0, 0x401 ; GCN-NEXT: s_movk_i32 s1, 0x400 diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll index d70d45d44af0fd..944dcda5eba6f2 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll @@ -6,12 
+6,12 @@ declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr nocapture, double) # define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: IllegalGEPConst: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll index 0f951e89d37c8a..81239e841e097e 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll @@ -45,7 +45,7 @@ ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 0 -define amdgpu_kernel void @minimal_kernel_inputs() #0 { +define amdgpu_kernel void @minimal_kernel_inputs() { %id = call i32 @llvm.amdgcn.workgroup.id.x() store volatile i32 %id, ptr addrspace(1) undef ret void @@ -74,7 +74,7 @@ define amdgpu_kernel void @minimal_kernel_inputs() #0 { ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 0 -define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 { +define amdgpu_kernel void @minimal_kernel_inputs_with_stack() { %alloca = alloca i32, addrspace(5) %id = call i32 @llvm.amdgcn.workgroup.id.x() store volatile i32 %id, ptr addrspace(1) undef @@ -107,7 +107,7 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 { 
; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 2 -define amdgpu_kernel void @queue_ptr() #1 { +define amdgpu_kernel void @queue_ptr() { %queue.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 %load = load volatile i8, ptr addrspace(4) %queue.ptr %id = call i32 @llvm.amdgcn.workgroup.id.x() @@ -154,7 +154,7 @@ define amdgpu_kernel void @queue_ptr() #1 { ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 13 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 -define amdgpu_kernel void @all_inputs() #2 { +define amdgpu_kernel void @all_inputs() { %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca @@ -182,19 +182,16 @@ define amdgpu_kernel void @all_inputs() #2 { ret void } -declare i32 @llvm.amdgcn.workgroup.id.x() #3 -declare i32 @llvm.amdgcn.workgroup.id.y() #3 -declare i32 @llvm.amdgcn.workgroup.id.z() #3 -declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3 -declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #3 -declare align 4 ptr addrspace(4) @llvm.amdgcn.queue.ptr() #3 -declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #3 -declare i64 @llvm.amdgcn.dispatch.id() #3 - -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #1 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #2 = { "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +declare i32 @llvm.amdgcn.workgroup.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.y() #0 +declare i32 @llvm.amdgcn.workgroup.id.z() #0 +declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #0 +declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 +declare align 4 ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 +declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #0 +declare i64 @llvm.amdgcn.dispatch.id() #0 + +attributes #0 = { nounwind readnone speculatable willreturn } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll index fb402b5ba30d12..be6f8a4375163a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll @@ -4,29 +4,29 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 { ; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[0:1], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GCN-NEXT: 
s_bcnt1_i32_b64 s1, s[6:7] +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s8, s[2:3], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_mov_b32_e32 v5, v1 ; GCN-NEXT: v_add_f32_e32 v4, v5, v2 -; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[2:3] glc +; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[4:5] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v5 @@ -36,7 +36,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GCN-NEXT: ; %bb.3: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB0_4: ; %Flow2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: v_readfirstlane_b32 s0, v1 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, s0 @@ -52,20 +52,20 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 { ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[0:1], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 
+; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: .LBB1_2: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 52fe2342d41a82..e312b37b2e0bbd 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -13,8 +13,8 @@ ; float ; -------------------------------------------------------------------- -define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32: +define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -27,7 +27,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -36,7 +36,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -46,7 +46,7 
@@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off @@ -70,7 +70,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc @@ -78,7 +78,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -100,7 +100,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -122,7 +122,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -150,7 +150,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -179,12 +179,12 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -197,7 +197,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -206,7 +206,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -216,7 +216,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -240,7 +240,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc @@ -248,7 +248,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -270,7 +270,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX8-LABEL: 
global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -293,7 +293,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -321,7 +321,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -351,12 +351,12 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: 
s_wait_expcnt 0x0 @@ -369,7 +369,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -378,7 +378,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -388,7 +388,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -412,7 +412,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc @@ -420,7 +420,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -442,7 +442,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -465,7 +465,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -498,7 +498,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -533,12 +533,12 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = 
atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32: +define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -551,7 +551,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -560,7 +560,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -570,7 +570,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off @@ -593,7 +593,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 
s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off @@ -601,7 +601,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off @@ -609,7 +609,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] @@ -630,7 +630,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -657,7 +657,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -685,12 +685,12 @@ define void 
@global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -703,7 +703,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -712,7 +712,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -722,7 +722,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 @@ -745,7 +745,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 @@ -753,7 +753,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 @@ -761,7 +761,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -784,7 +784,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX7-LABEL: 
global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -811,7 +811,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -840,12 +840,12 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -858,7 +858,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: 
buffer_wbl2 sc1 @@ -867,7 +867,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -877,7 +877,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 @@ -900,7 +900,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 @@ -908,7 +908,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 @@ -916,7 +916,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; 
GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -939,7 +939,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -970,7 +970,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -1003,12 +1003,12 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { 
+; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1037,7 +1037,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -1046,7 +1046,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1072,7 +1072,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1096,7 +1096,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: 
global_load_dword v3, v[0:1], off offset:2044 @@ -1120,7 +1120,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1142,7 +1142,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1165,7 +1165,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1193,7 +1193,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1223,12 +1223,12 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1255,7 +1255,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -1264,7 +1264,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 @@ -1288,7 +1288,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX10-LABEL: 
global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 @@ -1311,7 +1311,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 @@ -1334,7 +1334,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 @@ -1355,7 +1355,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -1378,7 +1378,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX7-LABEL: 
global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1405,7 +1405,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1434,16 +1434,12 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; float with ftz/daz -; -------------------------------------------------------------------- - -define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1451,34 +1447,34 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1486,7 +1482,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1499,25 +1495,25 @@ define float 
@global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1529,36 +1525,37 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1567,7 +1564,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1579,14 +1576,14 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
global_agent_atomic_fadd_ret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1596,7 +1593,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1608,12 +1605,13 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret float %result } -define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1626,7 +1624,7 @@ 
define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -1635,7 +1633,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1645,7 +1643,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1669,7 +1667,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc @@ -1677,7 +1675,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1699,7 +1697,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1722,7 +1720,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1750,7 +1748,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1780,12 +1778,12 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val 
syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 ret float %result } -define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1793,181 +1791,154 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: 
global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX908-LABEL: 
global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 
vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3 -; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_add_f32_e32 v5, v6, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-NEXT: v_mov_b32_e32 v1, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz 
.LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v0 -; GFX6-NEXT: v_add_f32_e32 v5, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v1, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr 
float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst - ret float %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 + ret void } -define void @global_agent_atomic_fadd_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1975,151 +1946,171 @@ define void @global_agent_atomic_fadd_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 
0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; 
GFX908-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: 
buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2127,38 +2118,39 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off 
offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc @@ -2166,115 +2158,131 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt 
vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: 
global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: 
.LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %result } -define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2282,234 +2290,210 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: 
v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; 
GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 + ret float %result } -define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off 
offset:2044 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: 
s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc @@ -2517,207 +2501,147 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 
addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: 
buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst - ret float %result + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + ret void } -define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; 
GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 @@ -2740,51 +2664,23 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; 
GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -2807,7 +2703,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -2834,7 +2730,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -2863,762 +2759,539 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @global_agent_atomic_fadd_ret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f64: +define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 -; GFX11-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: 
v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, v1 -; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 -; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 
vcc, v[0:1], v[10:11] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v1 -; GFX6-NEXT: v_mov_b32_e32 v10, v0 -; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v8 -; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 -; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: 
s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret void } -define double @global_agent_atomic_fadd_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX908-LABEL: 
global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap 
v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, v1 -; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 -; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: 
v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v1 -; GFX6-NEXT: v_mov_b32_e32 v10, v0 -; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v8 -; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 -; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: 
s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 + ret void } -define double @global_agent_atomic_fadd_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: 
v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], 
off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, v1 -; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 -; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; 
GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v1 -; GFX6-NEXT: v_mov_b32_e32 v10, v0 -; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v8 -; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 -; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret float %result } -define void @global_agent_atomic_fadd_noret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f64: +define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX10-LABEL: 
global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -3626,406 +3299,321 @@ define void @global_agent_atomic_fadd_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX908-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX7-LABEL: 
global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 -; GFX7-NEXT: v_mov_b32_e32 v9, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB19_1: ; 
%atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 -; GFX6-NEXT: v_mov_b32_e32 v9, v5 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } -define void @global_agent_atomic_fadd_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 -; 
GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: 
global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX8-LABEL: 
global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: 
v_mov_b32_e32 v10, v6 -; GFX7-NEXT: v_mov_b32_e32 v9, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 -; GFX6-NEXT: v_mov_b32_e32 v9, v5 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: 
v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %result } -define void @global_agent_atomic_fadd_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN -; GFX12-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: 
v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 @@ -4033,1971 +3621,994 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: 
global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: 
flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 -; GFX7-NEXT: v_mov_b32_e32 v9, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: 
s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 -; GFX6-NEXT: v_mov_b32_e32 v9, v5 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, 
v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } -; -------------------------------------------------------------------- -; half -; -------------------------------------------------------------------- - -define half @global_agent_atomic_fadd_ret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16: +define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, 
v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX908-LABEL: 
global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; 
GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: 
v_lshl_b32_e32 v2, 0xffff, v5 -; GFX7-NEXT: v_not_b32_e32 v7, v2 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: buffer_load_dword 
v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 -; GFX6-NEXT: v_not_b32_e32 v7, v2 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst - ret half %result + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define half @global_agent_atomic_fadd_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fadd_ret_f16__offset12b_pos: +define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; 
GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX90A: ; 
%bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; 
GFX908-NEXT: v_not_b32_e32 v5, v5 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; 
GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: 
v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; 
GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst - ret half %result + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.no.fine.grained.memory !0 + ret void } -define half @global_agent_atomic_fadd_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; -------------------------------------------------------------------- +; float with ftz/daz +; -------------------------------------------------------------------- + +define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: 
global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, 
v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB24_1: ; 
%atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; 
GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc 
; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst - ret half %result - } + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result +} -define void @global_agent_atomic_fadd_noret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16: +define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 
; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, 
v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt 
vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 
v3, v2, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: 
v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, 
vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 
v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; 
GFX11-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off 
glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; 
GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX7-LABEL: 
global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v6, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-NEXT: v_mov_b32_e32 v1, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: v_add_f32_e32 v5, v6, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, 
v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v0, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fadd_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; 
GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 
s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: 
v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6012,91 +4623,31 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX908-LABEL: 
global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: 
v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6109,451 +4660,264 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 
-; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: 
v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 
-; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, 
s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: 
global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; 
GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_e32 v0, v1, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: 
v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 - ret half %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX11-LABEL: 
global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: v_add_f32_e32 
v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -6566,65 +4930,33 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr addrs ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6637,460 +4969,310 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX7-LABEL: 
global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX6-LABEL: 
global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 
1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_system_atomic_fadd_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; 
GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: 
buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 
v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap v3, 
v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 
vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: 
s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, 
exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; 
GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst - ret half %result -} + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result +} -define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: 
v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -7103,66 +5285,26 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 
-; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -7176,27 +5318,17 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -7209,61 +5341,39 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; 
-; GFX908-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -7275,27 +5385,17 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; 
GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7308,1461 +5408,605 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; 
GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; bfloat -; -------------------------------------------------------------------- - -define bfloat @global_agent_atomic_fadd_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16: +define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: 
global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, 
v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: 
buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start -; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB32_1: 
; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; 
GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; 
GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: 
s_mov_b32 s6, 0 -; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr 
addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst - ret bfloat %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %result } -define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; 
GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; 
GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 
s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 
v5, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: 
v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 
v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 
v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst - ret bfloat %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret void } -define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 
0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) 
; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 
v8, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; 
GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; 
GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB34_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 
3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, 
v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst - ret bfloat %result - } + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret float %result +} 
-define void @global_agent_atomic_fadd_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16: +define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: 
s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: 
v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 
0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8777,109 +6021,31 @@ define void @global_agent_atomic_fadd_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: 
global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; 
GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -8892,661 +6058,284 @@ define void @global_agent_atomic_fadd_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16: +; 
GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX6-LABEL: 
global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val 
syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } -define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX11-NEXT: ; 
=>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; 
GFX10-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; 
GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; GFX908-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; 
GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX7-LABEL: 
global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret float %result } -define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: 
v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; 
GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, 
v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; 
GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB37_1 @@ -9554,1298 +6343,796 @@ define void 
@global_agent_atomic_fadd_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 -; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 -; GFX908-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 
exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], 
vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; 
GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret void } -define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: 
v_add_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This 
Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX10-NEXT: v_bfe_u32 v5, 
v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 -; 
GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, s7, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], 
v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX8-NEXT: v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: 
s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_mov_b32_e32 v11, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], 
v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_mov_b32_e32 v11, v1 +; GFX6-NEXT: v_mov_b32_e32 v10, v0 +; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_mov_b32_e32 v0, v8 +; GFX6-NEXT: v_mov_b32_e32 v1, v9 +; GFX6-NEXT: v_mov_b32_e32 v2, v10 
+; GFX6-NEXT: v_mov_b32_e32 v3, v11 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 - ret bfloat %result + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; 
GFX940-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: 
v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; 
GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, 
v4 +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: 
v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_mov_b32_e32 v11, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: 
v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v11, v1 +; GFX6-NEXT: v_mov_b32_e32 v10, v0 +; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], 
s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_mov_b32_e32 v0, v8 +; GFX6-NEXT: v_mov_b32_e32 v1, v9 +; GFX6-NEXT: v_mov_b32_e32 v2, v10 +; GFX6-NEXT: v_mov_b32_e32 v3, v11 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 - ret void + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: 
v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 +; GFX940-NEXT: 
buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v5, 
v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; 
GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; 
GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 
v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v11, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; 
GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v11, v1 +; GFX6-NEXT: v_mov_b32_e32 v10, v0 +; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v0, v8 +; GFX6-NEXT: v_mov_b32_e32 v1, v9 +; GFX6-NEXT: v_mov_b32_e32 v2, v10 +; GFX6-NEXT: v_mov_b32_e32 v3, v11 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst - ret bfloat %result + 
%gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 
0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -10854,90 +7141,31 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: 
s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -10946,38 +7174,23 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, 
vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB41_1 @@ -10985,1371 +7198,1680 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; 
GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_f64 
v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], 
v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v11, v7 +; GFX7-NEXT: v_mov_b32_e32 v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v9, v5 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f64 
v[4:5], v[6:7], v[2:3] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v11, v7 +; GFX6-NEXT: v_mov_b32_e32 v10, v6 +; GFX6-NEXT: v_mov_b32_e32 v9, v5 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; <2 x half> -; -------------------------------------------------------------------- - -define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16: +define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: 
s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; 
GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v11, v7 +; GFX7-NEXT: v_mov_b32_e32 v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v9, v5 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 ; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v11, v7 +; GFX6-NEXT: v_mov_b32_e32 v10, v6 +; GFX6-NEXT: v_mov_b32_e32 v9, v5 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: 
buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst - ret <2 x half> %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; 
GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off 
offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; 
GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: 
v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v11, v7 +; GFX7-NEXT: v_mov_b32_e32 v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v9, v5 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; 
GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 ; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: 
v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v11, v7 +; GFX6-NEXT: v_mov_b32_e32 v10, v6 +; GFX6-NEXT: v_mov_b32_e32 v9, v5 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst - ret <2 x half> %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; -------------------------------------------------------------------- +; half +; -------------------------------------------------------------------- + +define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX940-LABEL: 
global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; 
GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: 
v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, 
v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, 
v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; 
GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 +; GFX7-NEXT: v_not_b32_e32 v7, v2 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: 
v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 +; GFX6-NEXT: v_not_b32_e32 v7, v2 ; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v3, v4, v7 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 
+; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst - ret <2 x half> %result + %result = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret half %result } -define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16: +define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 
v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, 
v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 
v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16: +; 
GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: 
v_add_f32_e32 v2, v2, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr 
addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst - ret void + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret half %result } -define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, 
v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: 
buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, 
v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv 
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; 
GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: 
buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: 
v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, 
v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst - ret void -} + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 + %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret half %result + } -define void 
@global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: v_mov_b32_e32 
v3, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -12363,17 +8885,26 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: 
v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -12386,35 +8917,88 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; 
GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: 
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -12427,116 +9011,10205 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: 
v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt 
vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %unused = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; 
GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: 
v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; 
GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 
exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, 
v3 +; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 
vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: 
v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 
v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: 
v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 + %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, 
s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 +; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 +; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_e32 v0, v1, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; 
GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, 
half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ret half %result +} + +define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 +; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: 
global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 +; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, 
exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, 
v4 +; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 
v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, 
v4, v5 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 +; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; 
GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; 
GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; 
GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret half %result +} + +define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 
+; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: 
v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: 
v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: 
v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +; -------------------------------------------------------------------- +; bfloat +; -------------------------------------------------------------------- + +define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: 
v_and_b32_e32 v3, 3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 +; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; 
GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, 
v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 
16, v2 +; GFX7-NEXT: v_and_b32_e32 v3, v4, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, v4, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 
addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result +} + +define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 
16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; 
GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, 
vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 +; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 
+; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; 
GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 +; 
GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result +} + +define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 
s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, 
v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 +; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; 
GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, 
v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; 
GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 
v6, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 + %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result + } + +define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; 
GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 
exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: 
v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; 
GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, 
v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; 
GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %unused = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 
v8, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB58_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB58_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, 
v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; 
GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB58_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, 
v3, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB58_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: 
v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB59_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB59_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: 
v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 +; 
GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB59_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: s_or_b64 
s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB59_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: 
s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 + %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, 
vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB60_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB60_1: ; 
%atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB60_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 +; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc +; GFX90A-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 +; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, s7, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB60_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc +; GFX8-NEXT: v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB60_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: 
.LBB60_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; 
GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB60_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result +} + +define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; 
GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB61_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB61_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 +; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX90A-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 +; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB61_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB61_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; 
GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: 
buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB61_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB62_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB62_1: ; %atomicrmw.start +; 
GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB62_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .p2align 6 
+; GFX11-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB62_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_b32_e32 
v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 +; GFX10-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB62_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; 
GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB62_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: 
v_not_b32_e32 v4, v4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB62_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX8-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB62_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB62_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 +; 
GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB62_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result +} + +define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB63_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v3, v[0:1], 
off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB63_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB63_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; 
GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB63_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; 
GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB63_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: 
v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB63_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB63_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v6, 
0xffff0000, v2 +; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB63_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB63_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +; -------------------------------------------------------------------- +; <2 x half> +; -------------------------------------------------------------------- + +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB64_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB64_1 +; GFX10-NEXT: 
; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB64_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB64_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: 
v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB64_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB64_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB65_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB65_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off 
offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB65_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB65_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB65_1 +; GFX7-NEXT: ; 
%bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB65_1 +; 
GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB66_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB66_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB66_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB66_1 +; GFX8-NEXT: ; %bb.2: 
; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; 
GFX7-NEXT: s_cbranch_execnz .LBB66_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; 
GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB66_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB67_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB67_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; 
GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB67_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; 
GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB67_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: 
v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB67_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB68_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB68_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB68_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; 
%bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB68_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 
0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB68_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void 
@global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB69_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB69_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: 
v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB69_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; 
GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB69_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB69_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB70_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB70_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB70_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB70_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 
v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB70_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; 
GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB70_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB71_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: s_waitcnt 
vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB71_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB71_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 
v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB71_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: 
v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB71_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 
v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB71_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB72_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB72_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; 
GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB72_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB72_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 
exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB72_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; 
GFX6-NEXT: s_cbranch_execnz .LBB72_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret <2 x half> %result +} + +define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB73_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB73_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB73_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: 
v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB73_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, 
v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB73_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret void +} + +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB74_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB74_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 
exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB74_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, 
v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB74_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: 
v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB74_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB74_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret <2 x half> %result +} + +define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB75_1: ; %atomicrmw.start +; 
GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB75_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB75_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB75_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; 
GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB75_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB75_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret void +} + +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv 
scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB76_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; 
GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB76_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB76_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; 
GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB76_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: 
buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB76_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], 
s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB76_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: 
v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB77_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB77_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB77_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB77_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword 
v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB77_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; 
GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB77_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +; -------------------------------------------------------------------- +; <2 x bfloat> +; -------------------------------------------------------------------- + +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; 
GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: 
v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB78_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 
0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB78_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; 
GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB78_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB78_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: 
global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB78_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB78_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define <2 x bfloat> 
@global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; 
GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB79_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 
v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB79_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 
v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, 
v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB79_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: 
v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB79_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 
0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB79_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: 
v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB79_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 
0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB80_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB80_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: 
s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB80_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB80_1: ; 
%atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB80_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: 
v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB80_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start 
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB80_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB80_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB81_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; 
GFX10-NEXT: s_cbranch_execnz .LBB81_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB81_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB81_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: 
s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB81_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB81_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 
v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: s_cbranch_execnz .LBB81_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12544,128 
+19217,227 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB82_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX10-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB82_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB82_1 +; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB82_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB82_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12673,48 +19445,42 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; 
GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_cbranch_execnz .LBB82_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12722,55 +19488,49 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: 
v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_cbranch_execnz .LBB82_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst - ret <2 x half> %result + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +define void 
@global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12778,229 +19538,328 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB83_1 ; GFX11-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB83_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; 
GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB83_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: 
global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB83_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: 
v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB83_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, 
vcc +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, 
v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_cbranch_execnz .LBB83_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_cbranch_execnz .LBB83_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst + %gep = 
getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; <2 x bfloat> -; -------------------------------------------------------------------- - -define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16: +define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13008,30 +19867,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fadd_ret_v2bf16: +; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -13056,7 +19915,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -13064,21 +19923,21 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_cbranch_execnz .LBB84_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; 
GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -13098,29 +19957,29 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_cbranch_execnz .LBB84_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX90A-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -13139,28 
+19998,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB84_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -13179,67 +20040,68 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], 
v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cbranch_execnz .LBB84_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX8-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: 
v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cbranch_execnz .LBB84_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX7-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -13248,7 +20110,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: 
v_mul_f32_e32 v3, 1.0, v3 @@ -13262,7 +20124,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 @@ -13270,21 +20132,21 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cbranch_execnz .LBB84_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX6-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -13293,7 +20155,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13308,7 +20170,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 @@ -13316,19 +20178,20 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_cbranch_execnz .LBB84_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13336,232 +20199,229 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 
16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_cbranch_execnz .LBB85_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 
v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_cbranch_execnz .LBB85_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-NEXT: 
.LBB85_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; 
GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: 
v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cbranch_execnz .LBB85_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 
v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cbranch_execnz .LBB85_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; 
GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13570,43 +20430,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; 
GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_cbranch_execnz .LBB85_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13615,50 +20473,48 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; 
GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_cbranch_execnz .LBB85_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr 
addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst - ret <2 x bfloat> %result + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13666,30 +20522,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 
v3, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -13714,7 +20570,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -13722,21 +20578,21 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_cbranch_execnz .LBB86_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-NEXT: 
.LBB86_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -13756,29 +20612,29 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_cbranch_execnz .LBB86_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -13797,28 +20653,28 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; 
GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -13837,162 +20693,156 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: 
s_cbranch_execnz .LBB86_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: 
v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_cbranch_execnz .LBB86_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cbranch_execnz .LBB86_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: 
v_mov_b32_e32 v6, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_cbranch_execnz .LBB86_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %result } -define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16: +define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14005,7 +20855,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -14014,7 +20864,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -14023,7 +20873,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14055,20 +20905,20 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_cbranch_execnz .LBB87_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14095,12 +20945,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_cbranch_execnz .LBB87_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -14109,7 +20959,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14134,12 +20984,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -14148,7 +20998,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14173,19 +21023,19 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cbranch_execnz .LBB87_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14213,12 +21063,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: 
s_cbranch_execnz .LBB87_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14234,7 +21084,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -14256,12 +21106,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_cbranch_execnz .LBB87_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14277,7 +21127,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -14300,17 +21150,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr 
addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_cbranch_execnz .LBB87_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } -define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14318,320 +21168,327 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, 
off offset:2044 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v3, 16, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_cbranch_execnz .LBB88_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, 
v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_cbranch_execnz .LBB88_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 
v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; 
GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_cbranch_execnz .LBB88_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; 
GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_cbranch_execnz .LBB88_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; 
GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 
16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cbranch_execnz .LBB88_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 
16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cbranch_execnz .LBB88_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret <2 x bfloat> %result } -define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +define 
void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14639,30 +21496,30 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14685,7 +21542,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -14694,20 +21551,20 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_cbranch_execnz .LBB89_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14726,7 +21583,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -14734,21 +21591,21 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_cbranch_execnz .LBB89_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14766,28 +21623,28 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 
; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB89_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14805,28 +21662,26 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_cbranch_execnz .LBB89_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX8-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14854,32 +21709,28 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_cbranch_execnz .LBB89_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; 
GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -14901,32 +21752,28 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cbranch_execnz .LBB89_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -14949,18 +21796,17 @@ define 
void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cbranch_execnz .LBB89_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret void } -define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14968,30 +21814,30 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 
sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -15016,7 +21862,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -15024,21 +21870,21 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_cbranch_execnz .LBB90_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -15058,29 +21904,29 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_cbranch_execnz .LBB90_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -15099,30 +21945,28 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB90_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -15141,68 +21985,67 @@ define <2 x bfloat> 
@global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_cbranch_execnz .LBB90_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: 
v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_cbranch_execnz .LBB90_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -15211,7 +22054,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: v_and_b32_e32 v2, 
0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -15225,7 +22068,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 @@ -15233,21 +22076,21 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cbranch_execnz .LBB90_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -15256,7 +22099,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: v_and_b32_e32 v2, 
0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -15271,7 +22114,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 @@ -15279,20 +22122,19 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cbranch_execnz .LBB90_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } -define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15300,30 +22142,30 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15346,7 +22188,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v2, 
v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -15355,20 +22197,20 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_cbranch_execnz .LBB91_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15387,7 +22229,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -15395,21 +22237,21 @@ define void 
@global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_cbranch_execnz .LBB91_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15427,30 +22269,28 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB91_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15468,28 +22308,26 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cbranch_execnz .LBB91_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; 
GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15517,19 +22355,19 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cbranch_execnz .LBB91_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -15538,7 +22376,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -15552,7 +22390,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], 
s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -15560,19 +22398,19 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_cbranch_execnz .LBB91_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -15581,7 +22419,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -15596,7 +22434,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], 
v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -15604,13 +22442,12 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: s_cbranch_execnz .LBB91_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret void } @@ -15621,162 +22458,162 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX12-LABEL: infer_as_before_atomic: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz .LBB58_2 +; GFX12-NEXT: s_cbranch_execz .LBB92_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v1, 
s[2:3] -; GFX12-NEXT: .LBB58_2: +; GFX12-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX12-NEXT: .LBB92_2: ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; GFX940-LABEL: infer_as_before_atomic: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB58_2 +; GFX940-NEXT: s_cbranch_execz .LBB92_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f32 v0, v1, s[2:3] -; GFX940-NEXT: .LBB58_2: +; GFX940-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX940-NEXT: .LBB92_2: ; GFX940-NEXT: s_endpgm ; ; GFX11-LABEL: infer_as_before_atomic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB58_2 +; GFX11-NEXT: s_cbranch_execz .LBB92_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 
+; GFX11-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[2:3] -; GFX11-NEXT: .LBB58_2: +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: .LBB92_2: ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX10-LABEL: infer_as_before_atomic: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s5, exec_lo -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB58_3 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB92_3 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: .LBB58_2: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: 
v_mov_b32_e32 v1, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB58_2 -; GFX10-NEXT: .LBB58_3: +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB92_2 +; GFX10-NEXT: .LBB92_3: ; GFX10-NEXT: s_endpgm ; ; GFX90A-LABEL: infer_as_before_atomic: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB58_2 +; GFX90A-NEXT: s_cbranch_execz .LBB92_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[2:3] -; GFX90A-NEXT: .LBB58_2: +; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX90A-NEXT: .LBB92_2: ; GFX90A-NEXT: s_endpgm ; ; GFX908-LABEL: infer_as_before_atomic: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_mov_b64 s[0:1], exec -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_cbranch_execz .LBB58_2 +; GFX908-NEXT: 
s_cbranch_execz .LBB92_2 ; GFX908-NEXT: ; %bb.1: -; GFX908-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX908-NEXT: v_mov_b32_e32 v0, 0 -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[2:3] -; GFX908-NEXT: .LBB58_2: +; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX908-NEXT: .LBB92_2: ; GFX908-NEXT: s_endpgm ; ; GFX8-LABEL: infer_as_before_atomic: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB58_3 +; GFX8-NEXT: s_cbranch_execz .LBB92_3 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX8-NEXT: s_bcnt1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v4, s5 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v4, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s4 
-; GFX8-NEXT: .LBB58_2: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v4 ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -15785,32 +22622,32 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB58_2 -; GFX8-NEXT: .LBB58_3: +; GFX8-NEXT: s_cbranch_execnz .LBB92_2 +; GFX8-NEXT: .LBB92_3: ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: infer_as_before_atomic: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b64 s[4:5], exec -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7-NEXT: s_cbranch_execz .LBB58_3 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_cbranch_execz .LBB92_3 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: .LBB58_2: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX7-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, v1 @@ -15821,32 +22658,32 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB58_2 -; GFX7-NEXT: .LBB58_3: +; GFX7-NEXT: s_cbranch_execnz .LBB92_2 +; GFX7-NEXT: .LBB92_3: ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: infer_as_before_atomic: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB58_3 +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX6-NEXT: s_cbranch_execz .LBB92_3 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX6-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: .LBB58_2: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -15858,13 +22695,15 @@ define amdgpu_kernel void 
@infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB58_2 -; GFX6-NEXT: .LBB58_3: +; GFX6-NEXT: s_cbranch_execnz .LBB92_2 +; GFX6-NEXT: .LBB92_3: ; GFX6-NEXT: s_endpgm %load = load ptr, ptr addrspace(4) %arg - %v = atomicrmw fadd ptr %load, float 1.0 syncscope("agent-one-as") monotonic, align 4 + %v = atomicrmw fadd ptr %load, float 1.0 syncscope("agent-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index ae5dca4aa86fb5..e7d62fdc00cfff 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -13,8 +13,8 @@ ; float ; -------------------------------------------------------------------- -define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32: +define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -27,7 +27,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -52,7 +52,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -62,7 +62,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -72,7 +72,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -96,7 +96,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -120,7 +120,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX8-LABEL: 
global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -144,7 +144,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -157,7 +157,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -170,12 +170,12 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -188,7 +188,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; 
GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -213,7 +213,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -223,7 +223,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -233,7 +233,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -257,7 +257,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX908-LABEL: 
global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -281,7 +281,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -306,7 +306,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -319,7 +319,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -333,12 +333,12 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define 
float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -351,7 +351,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -376,7 +376,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -386,7 +386,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -396,7 +396,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -420,7 +420,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -444,7 +444,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -469,7 +469,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -482,7 +482,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX6-LABEL: 
global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -496,12 +496,12 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32: +define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -514,7 +514,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -538,7 +538,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -548,7 +548,7 @@ define void 
@global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -558,7 +558,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -581,7 +581,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -604,7 +604,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -627,7 +627,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; 
%bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -639,7 +639,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -651,12 +651,12 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -669,7 +669,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -693,7 +693,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: 
s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -703,7 +703,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -713,7 +713,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -736,7 +736,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -759,7 +759,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX8-LABEL: 
global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -784,7 +784,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -796,7 +796,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -809,12 +809,12 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: 
s_wait_expcnt 0x0 @@ -827,7 +827,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -851,7 +851,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -861,7 +861,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -871,7 +871,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -894,7 +894,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -917,7 +917,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -942,7 +942,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -954,7 +954,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -967,12 +967,12 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val 
syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1003,7 +1003,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1028,7 +1028,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1056,7 +1056,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1082,7 +1082,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1108,7 +1108,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1132,7 +1132,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1157,7 +1157,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1187,7 +1187,7 @@ define float 
@global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1219,12 +1219,12 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1254,7 +1254,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1278,7 +1278,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1305,7 +1305,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1330,7 +1330,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1355,7 +1355,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1378,7 +1378,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -1403,7 +1403,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1432,7 +1432,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1463,16 +1463,12 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; float with ftz/daz -; -------------------------------------------------------------------- - -define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { 
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1485,7 +1481,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -1510,7 +1506,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1520,7 +1516,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1530,7 +1526,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -1554,7 +1550,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -1578,7 +1574,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -1602,7 +1598,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1615,7 +1611,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1628,12 +1624,12 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret float %result } -define float 
@global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1641,15 +1637,15 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1659,7 +1655,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1671,30 +1667,30 @@ define float 
@global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1703,7 +1699,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off 
offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1715,10 +1711,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1727,7 +1723,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1739,64 +1735,66 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; 
GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 
offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret float %result } -define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; -------------------------------------------------------------------- +; float with ftz/daz +; -------------------------------------------------------------------- + +define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1804,15 +1802,15 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; 
GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1822,7 +1820,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1834,30 +1832,30 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: 
global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1866,7 +1864,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1878,10 +1876,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1890,7 +1888,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1902,64 +1900,62 @@ define float 
@global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, 
s6 ; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1967,154 +1963,162 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], 
v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: 
global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz 
.LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; 
GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 +; 
GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2122,157 +2126,162 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: 
v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off 
offset:2044 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, 
v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 
v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc 
; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2280,15 +2289,15 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2297,7 +2306,7 @@ define void 
@global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2309,30 +2318,30 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:-2048 +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword 
v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2340,7 +2349,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2352,10 +2361,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2363,7 +2372,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2375,11 +2384,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX8-LABEL: 
global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2400,322 +2407,210 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, 
float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: 
global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; 
GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: 
global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX6-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst - ret float %result + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: 
global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2723,10 +2618,10 @@ define void 
@global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 @@ -2736,62 +2631,30 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 
-; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:-2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2799,10 +2662,8 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa 
; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2813,10 +2674,10 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2824,7 +2685,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2836,11 +2697,11 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: 
global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2861,7 +2722,222 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr 
addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: buffer_wbl2 
sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: 
v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: 
global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -2870,77 +2946,666 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], 
s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX6-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result +} + +define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: 
s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; 
GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: 
v_mov_b32_e32 v3, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: 
global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: 
v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx2 
v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], 
v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: 
global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 
offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 
511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst - ret void + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64: +define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -2948,35 +3613,35 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 
v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -2984,7 +3649,7 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -2992,89 +3657,89 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 
s[4:5], 0 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; 
GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -3082,13 +3747,13 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -3096,373 +3761,355 @@ define double 
@global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result } -define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 
th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off 
offset:2040 glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX8-LABEL: 
global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], 
s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], 
v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: 
buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off 
offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], 
v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: 
v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f64: +define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3470,34 +4117,34 @@ define void 
@global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3506,41 +4153,41 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) 
%ptr, double %v ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_cbranch_execnz .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; 
GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] @@ -3548,18 +4195,20 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3572,373 +4221,388 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; 
%bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: 
v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 +; GFX940-NEXT: global_atomic_max_f64 
v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 
s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; 
GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_cbranch_execnz .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cbranch_execnz .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret double %result } -define 
void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], 
v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX908-LABEL: 
global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: 
flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, 
v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret double %result } ; -------------------------------------------------------------------- ; half ; -------------------------------------------------------------------- -define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16: +define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3956,7 +4620,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -3977,13 +4641,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -3996,7 +4660,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4012,13 +4676,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX11-LABEL: 
global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -4032,7 +4696,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4054,13 +4718,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -4072,7 +4736,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4089,13 +4753,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 
-; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_cbranch_execnz .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -4108,7 +4772,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4123,13 +4787,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -4142,7 +4806,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4157,13 +4821,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -4176,7 +4840,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4192,13 +4856,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4215,7 +4879,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half 
%val) ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4234,14 +4898,14 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -4258,7 +4922,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4277,19 +4941,19 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; 
GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4308,7 +4972,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4329,13 +4993,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -4350,7 +5014,7 @@ define half 
@global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4366,13 +5030,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 +; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -4387,7 +5051,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4409,13 +5073,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -4428,7 +5092,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4445,13 +5109,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_cbranch_execnz .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -4465,7 +5129,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, 
v5 @@ -4480,13 +5144,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -4500,7 +5164,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4515,13 +5179,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -4535,7 +5199,7 @@ define 
half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4551,13 +5215,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -4575,7 +5239,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4594,14 +5258,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -4619,7 +5283,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4639,7 +5303,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 +; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -4647,12 +5311,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: 
global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4671,7 +5335,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4692,13 +5356,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -4714,7 +5378,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4730,13 +5394,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: 
s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 +; GFX940-NEXT: s_cbranch_execnz .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -4751,7 +5415,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4773,13 +5437,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -4792,7 +5456,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 
v4, v4 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4809,13 +5473,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4829,7 +5493,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4844,13 +5508,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX908-LABEL: 
global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4864,7 +5528,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4879,13 +5543,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4899,7 +5563,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4915,13 +5579,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -4939,7 +5603,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4958,14 +5622,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -4983,7 +5647,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; 
GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5003,7 +5667,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -5011,12 +5675,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16: +define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5034,7 +5698,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5055,12 +5719,12 @@ define 
void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -5073,7 +5737,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5089,12 +5753,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 +; GFX940-NEXT: s_cbranch_execnz .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -5108,7 +5772,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, 
v3 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5130,12 +5794,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_cbranch_execnz .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -5147,7 +5811,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5164,12 +5828,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_cbranch_execnz .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: 
v_mov_b32_e32 v3, v0 @@ -5182,7 +5846,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5197,12 +5861,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -5215,7 +5879,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5230,12 +5894,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_cbranch_execnz .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 
s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -5248,7 +5912,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5264,12 +5928,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_cbranch_execnz .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5286,7 +5950,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5305,12 +5969,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz 
.LBB25_1 +; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -5327,7 +5991,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5347,17 +6011,17 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5376,7 
+6040,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5397,12 +6061,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -5417,7 +6081,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5433,12 +6097,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; 
GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -5453,7 +6117,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5475,12 +6139,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -5493,7 +6157,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5510,12 +6174,12 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_cbranch_execnz .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -5529,7 +6193,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5544,12 +6208,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -5563,7 +6227,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: 
s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5578,12 +6242,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -5597,7 +6261,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5613,12 +6277,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; 
GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5636,7 +6300,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5655,12 +6319,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5678,7 +6342,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5698,18 +6362,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5728,7 +6392,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5749,12 +6413,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -5770,7 +6434,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5786,12 +6450,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -5806,7 +6470,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5828,12 +6492,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_cbranch_execnz .LBB31_1 ; GFX11-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -5846,7 +6510,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5863,12 +6527,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_cbranch_execnz .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5882,7 +6546,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ 
-5897,12 +6561,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5916,7 +6580,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5931,12 +6595,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5950,7 +6614,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: 
v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5966,12 +6630,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -5989,7 +6653,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6008,12 +6672,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX6-LABEL: 
global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6031,7 +6695,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6051,18 +6715,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6072,7 +6736,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: 
global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -6090,20 +6754,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -6117,19 +6781,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_cbranch_execnz .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; 
GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -6148,19 +6812,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_cbranch_execnz .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -6176,20 +6840,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_cbranch_execnz .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 
v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -6202,20 +6866,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -6228,13 +6892,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6242,7 +6906,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6256,12 +6920,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6272,7 +6936,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6289,13 +6953,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6306,7 +6970,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6324,19 +6988,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, 
align 4, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6346,7 +7010,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 @@ -6364,19 +7028,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start 
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6390,18 +7054,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6420,18 +7084,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_cbranch_execnz .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: 
.LBB29_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6447,19 +7111,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6472,19 +7136,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword 
v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6497,12 +7161,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -6510,7 +7174,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6524,12 +7188,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: 
global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6540,7 +7204,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6557,12 +7221,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6573,7 +7237,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6591,18 +7255,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: 
s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6621,7 +7285,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -6642,13 +7306,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -6663,7 +7327,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -6679,13 +7343,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_cbranch_execnz .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -6700,7 +7364,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -6722,13 +7386,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) 
% ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -6741,7 +7405,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -6758,13 +7422,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_cbranch_execnz .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -6778,7 +7442,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: 
v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -6795,13 +7459,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -6815,7 +7479,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -6830,13 +7494,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; 
-; GFX8-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6850,7 +7514,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6866,13 +7530,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_cbranch_execnz .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6890,7 +7554,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6909,14 +7573,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: s_or_b64 s[8:9], 
vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6934,7 +7598,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6954,7 +7618,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -6962,12 +7626,12 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) 
%ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6986,7 +7650,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7007,12 +7671,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7027,7 +7691,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: 
v_lshrrev_b32_e32 v2, v4, v3 @@ -7043,12 +7707,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7063,7 +7727,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7085,12 +7749,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_cbranch_execnz .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7103,7 +7767,7 @@ define void 
@global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7120,12 +7784,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_cbranch_execnz .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7139,7 +7803,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7156,12 +7820,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; 
-; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7175,7 +7839,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7190,12 +7854,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -7209,7 +7873,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7225,12 +7889,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7248,7 +7912,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7267,12 +7931,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7290,7 +7954,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start 
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7310,13 +7974,13 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -7324,8 +7988,8 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; bfloat ; -------------------------------------------------------------------- -define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16: +define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7342,7 +8006,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7370,13 +8034,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_or_b32 
s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -7390,7 +8054,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7412,13 +8076,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_cbranch_execnz .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -7432,7 +8096,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: 
.p2align 6 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7461,13 +8125,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_cbranch_execnz .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -7479,7 +8143,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7500,13 +8164,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_cbranch_execnz .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; 
GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -7520,7 +8184,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7539,13 +8203,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -7559,7 +8223,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -7578,13 +8242,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: 
s_cbranch_execnz .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -7597,7 +8261,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -7618,13 +8282,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_cbranch_execnz .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7641,7 +8305,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -7661,14 +8325,14 @@ 
define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -7685,7 +8349,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -7705,19 +8369,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr 
addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7736,7 +8400,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7764,13 +8428,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7786,7 +8450,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7808,13 +8472,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_cbranch_execnz .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7830,7 +8494,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7859,13 +8523,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_cbranch_execnz .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7878,7 +8542,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7899,13 +8563,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_cbranch_execnz .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7920,7 +8584,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7939,13 +8603,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7960,7 +8624,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -7979,13 +8643,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cbranch_execnz .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -7999,7 +8663,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8020,13 +8684,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cbranch_execnz .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8044,7 +8708,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8064,14 +8728,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 
v4, vcc, 0x7fe, v0 @@ -8089,7 +8753,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8110,7 +8774,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8118,12 +8782,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8142,7 +8806,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -8170,13 +8834,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -8193,7 +8857,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -8215,13 +8879,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_cbranch_execnz .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8237,7 +8901,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -8266,13 +8930,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_cbranch_execnz .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8285,7 +8949,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8306,13 +8970,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr 
addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_cbranch_execnz .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8327,7 +8991,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -8346,13 +9010,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8367,7 +9031,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr 
addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8386,13 +9050,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_cbranch_execnz .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8406,7 +9070,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8427,13 +9091,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_cbranch_execnz .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -8451,7 +9115,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8471,14 +9135,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -8496,7 +9160,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8517,7 +9181,7 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8525,12 +9189,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16: +define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8547,7 +9211,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8574,12 +9238,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -8593,7 +9257,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8615,12 +9279,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cbranch_execnz .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -8634,7 +9298,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8662,12 +9326,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_cbranch_execnz .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -8679,7 +9343,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8700,12 +9364,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_cbranch_execnz .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -8719,7 +9383,7 @@ define void 
@global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8738,12 +9402,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -8757,7 +9421,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8776,12 +9440,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cbranch_execnz .LBB39_1 ; GFX908-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -8794,7 +9458,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8815,12 +9479,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cbranch_execnz .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -8837,7 +9501,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8857,12 +9521,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr 
addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -8879,7 +9543,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8900,17 +9564,17 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB35_1 +; GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; 
GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8929,7 +9593,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8956,12 +9620,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -8977,7 +9641,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8999,12 +9663,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_cbranch_execnz .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9020,7 +9684,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9048,12 +9712,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_cbranch_execnz .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9066,7 +9730,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB36_1: ; 
%atomicrmw.start +; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9087,12 +9751,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_cbranch_execnz .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9107,7 +9771,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9126,12 +9790,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; 
GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9146,7 +9810,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9165,12 +9829,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_cbranch_execnz .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -9184,7 +9848,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9205,12 +9869,12 @@ define void 
@global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_cbranch_execnz .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9228,7 +9892,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9248,12 +9912,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9271,7 +9935,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 
0xffff0000, v2 -; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9292,18 +9956,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB36_1 +; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9322,7 +9986,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9349,12 +10013,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr 
addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -9371,7 +10035,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9393,12 +10057,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_cbranch_execnz .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -9414,7 +10078,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9442,12 +10106,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_cbranch_execnz .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -9460,7 +10124,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9481,12 +10145,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_cbranch_execnz .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9501,7 +10165,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9520,12 +10184,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9540,7 +10204,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9559,12 +10223,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_cbranch_execnz .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9578,7 +10242,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9599,12 +10263,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_cbranch_execnz .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9622,7 +10286,7 @@ 
define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9642,12 +10306,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9665,7 +10329,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9686,18 +10350,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB37_1 +; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = 
getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9707,7 +10371,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -9732,13 +10396,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: 
global_load_dword v3, v[0:1], off offset:2046 @@ -9746,7 +10410,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -9767,20 +10431,20 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_cbranch_execnz .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -9806,19 +10470,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_cbranch_execnz .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -9839,13 +10503,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_cbranch_execnz .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9853,7 +10517,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -9872,13 +10536,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9886,7 +10550,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -9905,13 +10569,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_cbranch_execnz .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -9919,7 +10583,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; 
GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -9939,12 +10603,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_cbranch_execnz .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -9955,7 +10619,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -9973,13 +10637,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; 
GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -9990,7 +10654,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10009,19 +10673,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB38_1 +; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10031,7 +10695,7 @@ 
define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10055,12 +10719,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10068,7 +10732,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10089,19 +10753,19 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cbranch_execnz .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10126,18 +10790,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_cbranch_execnz .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10158,12 +10822,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: 
s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_cbranch_execnz .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10171,7 +10835,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10190,12 +10854,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10203,7 +10867,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908-NEXT: 
.LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10222,12 +10886,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cbranch_execnz .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -10235,7 +10899,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10255,12 +10919,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_cbranch_execnz .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX7-NEXT: s_mov_b32 s6, 0 @@ -10271,7 +10935,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10289,12 +10953,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10305,7 +10969,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10324,18 +10988,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB39_1 +; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: 
s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10354,7 +11018,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -10382,13 +11046,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -10404,7 +11068,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -10426,13 +11090,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_cbranch_execnz .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -10448,7 +11112,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -10477,13 +11141,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: 
s_cbranch_execnz .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -10496,7 +11160,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10517,13 +11181,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_cbranch_execnz .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -10538,7 +11202,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10559,13 +11223,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -10580,7 +11244,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10599,13 +11263,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_cbranch_execnz .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; 
%bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -10619,7 +11283,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10640,13 +11304,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_cbranch_execnz .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -10664,7 +11328,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10684,14 +11348,14 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cbranch_execnz 
.LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -10709,7 +11373,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10730,7 +11394,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB40_1 +; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -10738,12 +11402,12 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +define void 
@global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10762,7 +11426,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10789,12 +11453,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -10810,7 +11474,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 
@@ -10832,12 +11496,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: s_cbranch_execnz .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -10853,7 +11517,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10881,12 +11545,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_cbranch_execnz .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -10899,7 +11563,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; 
GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10920,12 +11584,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_cbranch_execnz .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -10940,7 +11604,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10961,12 +11625,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -10981,7 +11645,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11000,12 +11664,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cbranch_execnz .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -11019,7 +11683,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11040,12 +11704,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cbranch_execnz .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11063,7 +11727,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11083,12 +11747,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11106,7 +11770,7 @@ define void 
@global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11127,13 +11791,13 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB41_1 +; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -11141,8 +11805,8 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; <2 x half> ; -------------------------------------------------------------------- -define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16: +define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11152,7 +11816,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; 
%atomicrmw.start +; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11167,19 +11831,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11193,19 +11857,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_cbranch_execnz .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: 
s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11221,19 +11885,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_cbranch_execnz .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11247,19 +11911,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_cbranch_execnz .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 
0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11271,19 +11935,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11295,20 +11959,20 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_cbranch_execnz .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, 
v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -11323,13 +11987,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_cbranch_execnz .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -11346,7 +12010,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11371,14 +12035,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 
s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -11395,7 +12059,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11421,19 +12085,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB42_1 +; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11443,7 +12107,7 @@ define <2 x 
half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11458,19 +12122,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11484,19 +12148,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_cbranch_execnz .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11512,19 +12176,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_cbranch_execnz .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11538,19 +12202,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_cbranch_execnz .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11562,19 +12226,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11586,13 +12250,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: 
s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_cbranch_execnz .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -11601,7 +12265,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -11616,12 +12280,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_cbranch_execnz .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -11638,7 +12302,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11663,14 +12327,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -11687,7 +12351,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11713,7 +12377,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB43_1 +; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -11721,12 +12385,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr 
addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11736,7 +12400,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11751,19 +12415,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; 
GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11777,19 +12441,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_cbranch_execnz .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11805,19 +12469,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_cbranch_execnz .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11831,19 +12495,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_cbranch_execnz .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11855,19 +12519,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX908-LABEL: 
global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11879,13 +12543,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_cbranch_execnz .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -11894,7 +12558,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -11909,12 +12573,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_cbranch_execnz .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -11935,7 +12599,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -11960,12 +12624,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -11986,7 +12650,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; 
GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12012,18 +12676,18 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB44_1 +; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16: +define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12033,7 +12697,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12048,18 +12712,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: 
s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12073,18 +12737,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12100,18 +12764,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_cbranch_execnz .LBB49_1 ; 
GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12125,18 +12789,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_cbranch_execnz .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12148,18 +12812,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12171,19 +12835,19 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12198,12 +12862,12 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12220,7 +12884,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12245,12 +12909,12 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12267,7 +12931,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12293,17 +12957,17 @@ define void 
@global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB45_1 +; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12313,7 +12977,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12328,18 +12992,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12353,18 +13017,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12380,18 +13044,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_cbranch_execnz 
.LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12405,18 +13069,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_cbranch_execnz .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12428,18 +13092,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz 
.LBB46_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12451,12 +13115,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -12465,7 +13129,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12480,12 +13144,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12502,7 +13166,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12527,12 +13191,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12549,7 +13213,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: 
v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12575,18 +13239,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB46_1 +; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12596,7 +13260,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12611,18 +13275,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12636,18 +13300,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12663,18 +13327,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_cbranch_execnz .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12688,18 +13352,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_cbranch_execnz .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A-NEXT: 
.LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12711,18 +13375,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12734,12 +13398,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -12748,7 +13412,7 @@ define void 
@global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12763,12 +13427,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -12789,7 +13453,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12814,12 +13478,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -12840,7 +13504,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12866,18 +13530,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12887,7 +13551,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -12902,19 +13566,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -12928,19 +13592,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_cbranch_execnz .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -12956,19 +13620,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_cbranch_execnz .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -12982,19 +13646,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_cbranch_execnz 
.LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -13008,19 +13672,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -13032,13 +13696,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_cbranch_execnz .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -13047,7 +13711,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -13062,12 +13726,12 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13084,7 +13748,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; 
GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13109,14 +13773,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13133,7 +13797,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13159,7 +13823,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -13167,12 +13831,12 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 
s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13182,7 +13846,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -13197,18 +13861,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; 
GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13222,18 +13886,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13249,18 +13913,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_cbranch_execnz .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13274,18 +13938,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_cbranch_execnz .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13299,18 +13963,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; 
GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13322,12 +13986,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -13336,7 +14000,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -13351,12 +14015,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13373,7 +14037,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13398,12 +14062,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13420,7 +14084,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13446,13 +14110,13 @@ define void 
@global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -13460,8 +14124,8 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16: +define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13472,7 +14136,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -13504,13 +14168,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -13519,7 +14183,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -13546,13 +14210,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_cbranch_execnz .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -13561,7 +14225,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -13594,21 +14258,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -13635,13 +14299,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_cbranch_execnz .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -13650,7 +14314,7 @@ define <2 x bfloat> 
@global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -13675,13 +14339,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -13690,7 +14354,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -13715,20 +14379,20 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; 
GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -13756,13 +14420,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13778,7 +14442,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13800,14 +14464,14 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: 
s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13823,7 +14487,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13846,19 +14510,19 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_cbranch_execnz .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +define <2 x bfloat> 
@global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13869,7 +14533,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -13901,13 +14565,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -13916,7 +14580,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -13943,13 +14607,13 @@ 
define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_cbranch_execnz .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -13958,7 +14622,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -13991,21 +14655,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; 
GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14032,13 +14696,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_cbranch_execnz .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14047,7 +14711,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14072,13 +14736,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14087,7 +14751,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14112,13 +14776,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cbranch_execnz .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -14127,7 +14791,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14155,12 +14819,12 @@ 
define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14176,7 +14840,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14198,14 +14862,14 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14221,7 +14885,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; 
GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14244,7 +14908,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_cbranch_execnz .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -14252,12 +14916,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14268,7 +14932,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: 
s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -14300,13 +14964,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14315,7 +14979,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -14342,13 +15006,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_cbranch_execnz .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: 
+; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -14357,7 +15021,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14390,21 +15054,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14431,13 +15095,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_cbranch_execnz .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14446,7 +15110,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14471,13 +15135,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14486,7 +15150,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14511,13 +15175,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_cbranch_execnz .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -14526,7 +15190,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14554,12 +15218,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX7-LABEL: 
global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -14579,7 +15243,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -14601,12 +15265,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -14626,7 +15290,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -14649,18 +15313,18 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] 
-; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_cbranch_execnz .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16: +define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14671,7 +15335,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14702,12 +15366,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX940-LABEL: 
global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -14716,7 +15380,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14743,12 +15407,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -14757,7 +15421,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14789,20 +15453,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, 
s1 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14829,12 +15493,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_cbranch_execnz .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -14843,7 +15507,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ 
-14868,12 +15532,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -14882,7 +15546,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14907,19 +15571,19 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB53_1: ; 
%atomicrmw.start +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14947,12 +15611,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14968,7 +15632,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -14990,12 +15654,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15011,7 +15675,7 @@ define void 
@global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15034,17 +15698,17 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_cbranch_execnz .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15055,7 +15719,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15086,12 +15750,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15100,7 +15764,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15127,12 +15791,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off 
offset:2044 @@ -15141,7 +15805,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15173,20 +15837,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15213,12 +15877,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_cbranch_execnz .LBB58_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15227,7 +15891,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15252,12 +15916,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15266,7 +15930,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15291,12 +15955,12 @@ define void 
@global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -15305,7 +15969,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15333,12 +15997,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -15354,7 +16018,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; 
GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15376,12 +16040,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15397,7 +16061,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15420,18 +16084,18 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cbranch_execnz .LBB58_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val 
syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15442,7 +16106,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15473,12 +16137,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15487,7 +16151,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; 
GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15514,12 +16178,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -15528,7 +16192,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15560,20 +16224,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_cbranch_execnz .LBB59_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15600,12 +16264,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_cbranch_execnz .LBB59_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15614,7 +16278,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15639,12 +16303,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15653,7 +16317,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15678,12 +16342,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -15692,7 +16356,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: 
.LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15720,12 +16384,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -15745,7 +16409,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15767,12 +16431,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -15792,7 +16456,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15815,18 +16479,18 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cbranch_execnz .LBB59_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15837,7 +16501,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; 
GFX12-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -15869,13 +16533,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15884,7 +16548,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -15911,13 +16575,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_cbranch_execnz .LBB60_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: 
global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -15926,7 +16590,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -15959,21 +16623,21 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16000,13 +16664,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 
exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_cbranch_execnz .LBB60_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16015,7 +16679,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16042,13 +16706,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16057,7 +16721,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: 
s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16082,13 +16746,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_cbranch_execnz .LBB60_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -16097,7 +16761,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -16125,12 +16789,12 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_cbranch_execnz .LBB60_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: 
global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16146,7 +16810,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16168,14 +16832,14 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16191,7 +16855,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16214,7 +16878,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cbranch_execnz .LBB60_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -16222,12 +16886,12 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -16238,7 +16902,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16269,12 +16933,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: 
s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16283,7 +16947,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16310,12 +16974,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -16324,7 +16988,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16356,20 +17020,20 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_cbranch_execnz .LBB61_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16396,12 +17060,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_cbranch_execnz .LBB61_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16410,7 +17074,7 @@ define void 
@global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16437,12 +17101,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16451,7 +17115,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16476,12 +17140,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, 
exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -16490,7 +17154,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16518,12 +17182,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16539,7 +17203,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16561,12 +17225,12 @@ define void 
@global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_cbranch_execnz .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16582,7 +17246,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16605,15 +17269,17 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: s_cbranch_execnz .LBB61_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" 
"amdgpu-unsafe-fp-atomics"="true" } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 915ce7433f5b0c..91a8ac7c935b61 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -13,8 +13,8 @@ ; float ; -------------------------------------------------------------------- -define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32: +define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -27,7 +27,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -52,7 +52,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -62,7 +62,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -72,7 +72,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -96,7 +96,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -120,7 +120,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -144,7 +144,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -157,7 +157,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX6-LABEL: 
global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -170,12 +170,12 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -188,7 +188,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -213,7 +213,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 @@ -223,7 +223,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -233,7 +233,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -257,7 +257,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -281,7 +281,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -306,7 +306,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -319,7 +319,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -333,12 +333,12 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -351,7 +351,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX940-LABEL: 
global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -376,7 +376,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -386,7 +386,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -396,7 +396,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -420,7 +420,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: 
global_load_dword v3, v[0:1], off offset:-2048 @@ -444,7 +444,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -469,7 +469,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -482,7 +482,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -496,12 +496,12 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32: +define void 
@global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -514,7 +514,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -538,7 +538,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -548,7 +548,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -558,7 +558,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -581,7 
+581,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -604,7 +604,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -627,7 +627,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -639,7 +639,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -651,12 +651,12 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, 
float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -669,7 +669,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -693,7 +693,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -703,7 +703,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -713,7 +713,7 @@ define void 
@global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -736,7 +736,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -759,7 +759,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -784,7 +784,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -796,7 +796,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -809,12 +809,12 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -827,7 +827,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -851,7 +851,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX11-LABEL: 
global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -861,7 +861,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -871,7 +871,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -894,7 +894,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -917,7 +917,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -942,7 +942,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -954,7 +954,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -967,12 +967,12 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1003,7 +1003,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1028,7 +1028,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1056,7 +1056,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1082,7 +1082,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1108,7 +1108,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1132,7 +1132,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1157,7 +1157,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1187,7 +1187,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1219,12 +1219,12 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, 
!amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1254,7 +1254,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1278,7 +1278,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1305,7 +1305,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off 
offset:2044 @@ -1330,7 +1330,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1355,7 +1355,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1378,7 +1378,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -1403,7 +1403,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1432,7 +1432,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) 
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1463,16 +1463,12 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; float with ftz/daz -; -------------------------------------------------------------------- - -define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1485,7 +1481,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -1510,7 +1506,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fmin_ret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1520,7 +1516,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1530,7 +1526,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -1554,7 +1550,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -1578,7 +1574,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -1602,7 +1598,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) 
%ptr, float ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1615,7 +1611,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1628,12 +1624,12 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1641,15 +1637,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, 
v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1659,7 +1655,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1671,30 +1667,30 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 
; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1703,7 +1699,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1715,10 +1711,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1727,7 
+1723,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1739,64 +1735,66 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; -------------------------------------------------------------------- +; float with ftz/daz +; 
-------------------------------------------------------------------- + +define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1804,15 +1802,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1822,7 +1820,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1834,30 +1832,30 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1866,7 +1864,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1878,10 +1876,10 @@ 
define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1890,7 +1888,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1902,64 +1900,62 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; 
GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1967,154 +1963,162 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_mov_b32_e32 
v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX90A-LABEL: 
global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: 
v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: 
s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +define float 
@global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2122,157 +2126,162 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; 
GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; 
GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; 
GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +define void 
@global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2280,15 +2289,15 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2297,7 +2306,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2309,30 +2318,30 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX11-LABEL: 
global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:-2048 +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2340,7 +2349,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2352,10 +2361,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr 
addrspac ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2363,7 +2372,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2375,11 +2384,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2400,322 +2407,210 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; 
GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 
sc1 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; 
GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 
v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 
s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX6-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], 
v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst - ret float %result + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; 
GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2723,10 +2618,10 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 @@ -2736,62 +2631,30 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: 
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX10-NEXT: s_waitcnt 
vmcnt(0) +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:-2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2799,10 +2662,8 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2813,10 +2674,10 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2824,7 +2685,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2836,11 +2697,11 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2861,7 +2722,222 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; 
GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, 
v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -2870,77 +2946,666 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc 
+; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX7-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX6-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) 
%ptr, i64 511 + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result +} + +define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: 
global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, 
s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 
vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 
v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX8: ; 
%bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: 
s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX6-NEXT: .LBB15_1: ; 
%atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst - ret void + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64: +define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 
0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -2948,35 +3613,35 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX11-LABEL: 
global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -2984,7 +3649,7 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -2992,89 +3657,89 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 
s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: 
s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -3082,13 +3747,13 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -3096,373 +3761,355 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result } -define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 
sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; 
GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %unused 
= atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] 
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: 
v_add_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 
s5, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f64: +define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 
-; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3470,34 +4117,34 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3506,41 +4153,41 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_cbranch_execnz .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] @@ -3548,18 +4195,20 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; 
GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3572,373 +4221,388 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] 
- %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], 
v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_cbranch_execnz .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: 
flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cbranch_execnz .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret double %result } -define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX11-LABEL: 
global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; 
GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], 
v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %unused = atomicrmw fmin ptr addrspace(1) %gep, 
double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret double %result } ; -------------------------------------------------------------------- ; half ; -------------------------------------------------------------------- -define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16: +define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3956,7 +4620,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -3977,13 +4641,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -3996,7 +4660,7 @@ define half 
@global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4012,13 +4676,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -4032,7 +4696,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4054,13 +4718,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, 
v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -4072,7 +4736,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4089,13 +4753,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_cbranch_execnz .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -4108,7 +4772,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4123,13 +4787,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 
v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -4142,7 +4806,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4157,13 +4821,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -4176,7 +4840,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: 
.LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4192,13 +4856,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4215,7 +4879,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4234,14 +4898,14 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 
v3, v0 @@ -4258,7 +4922,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4277,19 +4941,19 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4308,7 +4972,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB27_1: ; 
%atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4329,13 +4993,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -4350,7 +5014,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4366,13 +5030,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 +; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -4387,7 +5051,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4409,13 +5073,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -4428,7 +5092,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4445,13 +5109,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: 
s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_cbranch_execnz .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -4465,7 +5129,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4480,13 +5144,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -4500,7 +5164,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: 
.LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4515,13 +5179,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -4535,7 +5199,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4551,13 +5215,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -4575,7 +5239,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4594,14 +5258,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -4619,7 +5283,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4639,7 +5303,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 +; GFX6-NEXT: s_cbranch_execnz 
.LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -4647,12 +5311,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4671,7 +5335,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4692,13 +5356,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -4714,7 +5378,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4730,13 +5394,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 +; GFX940-NEXT: s_cbranch_execnz .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -4751,7 +5415,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4773,13 +5437,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr 
addrspace(1) %p ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -4792,7 +5456,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4809,13 +5473,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4829,7 +5493,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; 
GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4844,13 +5508,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4864,7 +5528,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4879,13 +5543,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4899,7 +5563,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4915,13 +5579,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -4939,7 +5603,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4958,14 +5622,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; 
GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -4983,7 +5647,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5003,7 +5667,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -5011,12 +5675,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void 
@global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16: +define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5034,7 +5698,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5055,12 +5719,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -5073,7 +5737,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5089,12 
+5753,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 +; GFX940-NEXT: s_cbranch_execnz .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -5108,7 +5772,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5130,12 +5794,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_cbranch_execnz .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -5147,7 +5811,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; 
GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5164,12 +5828,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_cbranch_execnz .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -5182,7 +5846,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5197,12 +5861,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -5215,7 +5879,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5230,12 +5894,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_cbranch_execnz .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -5248,7 +5912,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5264,12 +5928,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_cbranch_execnz .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX7-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5286,7 +5950,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5305,12 +5969,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -5327,7 +5991,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5347,17 +6011,17 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; 
GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5376,7 +6040,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5397,12 +6061,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -5417,7 +6081,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5433,12 +6097,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -5453,7 +6117,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5475,12 +6139,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; 
GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -5493,7 +6157,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5510,12 +6174,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_cbranch_execnz .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -5529,7 +6193,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ 
-5544,12 +6208,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -5563,7 +6227,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5578,12 +6242,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -5597,7 +6261,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: 
v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5613,12 +6277,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5636,7 +6300,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5655,12 +6319,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX6-LABEL: 
global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5678,7 +6342,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5698,18 +6362,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5728,7 +6392,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5749,12 +6413,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -5770,7 +6434,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5786,12 +6450,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX11-LABEL: 
global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -5806,7 +6470,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5828,12 +6492,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_cbranch_execnz .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -5846,7 +6510,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5863,12 +6527,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, 
s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_cbranch_execnz .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5882,7 +6546,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5897,12 +6561,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5916,7 +6580,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB31_1: 
; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5931,12 +6595,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5950,7 +6614,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5966,12 +6630,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -5989,7 
+6653,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6008,12 +6672,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6031,7 +6695,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6051,18 +6715,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] 
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6072,7 +6736,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -6090,20 +6754,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword 
v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -6117,19 +6781,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_cbranch_execnz .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -6148,19 +6812,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_cbranch_execnz .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX10-LABEL: 
global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -6176,20 +6840,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_cbranch_execnz .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -6202,20 +6866,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 
exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -6228,13 +6892,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6242,7 +6906,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6256,12 +6920,12 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6272,7 +6936,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6289,13 +6953,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6306,7 +6970,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; 
GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6324,19 +6988,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6346,7 +7010,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f16_e32 v2, 
v3, v3 @@ -6364,19 +7028,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6390,18 +7054,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB33_1: ; 
%atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6420,18 +7084,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_cbranch_execnz .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6447,19 +7111,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; 
GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6472,19 +7136,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6497,12 +7161,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -6510,7 +7174,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6524,12 +7188,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6540,7 +7204,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6557,12 +7221,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; 
GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6573,7 +7237,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6591,18 +7255,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6621,7 +7285,7 @@ define half 
@global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -6642,13 +7306,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -6663,7 +7327,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -6679,13 +7343,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_cbranch_execnz .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 
; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -6700,7 +7364,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -6722,13 +7386,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -6741,7 +7405,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 
v6, v5 @@ -6758,13 +7422,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_cbranch_execnz .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -6778,7 +7442,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -6795,13 +7459,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -6815,7 
+7479,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -6830,13 +7494,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6850,7 +7514,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6866,13 +7530,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_cbranch_execnz .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6890,7 +7554,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6909,14 +7573,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6934,7 +7598,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6954,7 +7618,7 @@ 
define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -6962,12 +7626,12 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6986,7 +7650,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7007,12 +7671,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7027,7 +7691,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7043,12 +7707,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7063,7 +7727,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7085,12 +7749,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_cbranch_execnz .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7103,7 +7767,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7120,12 +7784,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_cbranch_execnz .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 
@@ -7139,7 +7803,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7156,12 +7820,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7175,7 +7839,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7190,12 +7854,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -7209,7 +7873,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7225,12 +7889,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7248,7 +7912,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7267,12 +7931,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], 
vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7290,7 +7954,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7310,13 +7974,13 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -7324,8 +7988,8 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; bfloat ; -------------------------------------------------------------------- -define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fmin_ret_bf16: +define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7342,7 +8006,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7370,13 +8034,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -7390,7 +8054,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7412,13 +8076,13 @@ define bfloat 
@global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_cbranch_execnz .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -7432,7 +8096,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7461,13 +8125,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_cbranch_execnz .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -7479,7 +8143,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7500,13 +8164,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_cbranch_execnz .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -7520,7 +8184,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7539,13 +8203,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fmin_ret_bf16: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -7559,7 +8223,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -7578,13 +8242,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_cbranch_execnz .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -7597,7 +8261,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -7618,13 +8282,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_cbranch_execnz .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7641,7 +8305,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -7661,14 +8325,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -7685,7 +8349,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -7705,19 +8369,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7736,7 +8400,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7764,13 +8428,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7786,7 +8450,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7808,13 +8472,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_cbranch_execnz .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7830,7 +8494,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: 
.LBB33_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7859,13 +8523,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_cbranch_execnz .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7878,7 +8542,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7899,13 +8563,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_cbranch_execnz .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX90A-LABEL: 
global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7920,7 +8584,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7939,13 +8603,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7960,7 +8624,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -7979,13 +8643,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; 
GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cbranch_execnz .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -7999,7 +8663,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8020,13 +8684,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cbranch_execnz .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8044,7 +8708,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; 
GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8064,14 +8728,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8089,7 +8753,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8110,7 +8774,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8118,12 +8782,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8142,7 +8806,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -8170,13 +8834,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 
s0, 0xf800 @@ -8193,7 +8857,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -8215,13 +8879,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_cbranch_execnz .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8237,7 +8901,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -8266,13 +8930,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_cbranch_execnz .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8285,7 +8949,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8306,13 +8970,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_cbranch_execnz .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8327,7 +8991,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -8346,13 +9010,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8367,7 +9031,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8386,13 +9050,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_cbranch_execnz .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8406,7 +9070,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8427,13 +9091,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_cbranch_execnz .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -8451,7 +9115,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8471,14 +9135,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: 
s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -8496,7 +9160,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8517,7 +9181,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8525,12 +9189,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16: +define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, 
bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8547,7 +9211,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8574,12 +9238,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -8593,7 +9257,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8615,12 +9279,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: 
v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cbranch_execnz .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -8634,7 +9298,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8662,12 +9326,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_cbranch_execnz .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -8679,7 +9343,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8700,12 +9364,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_cbranch_execnz .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -8719,7 +9383,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8738,12 +9402,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -8757,7 +9421,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8776,12 +9440,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cbranch_execnz .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -8794,7 +9458,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8815,12 +9479,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: 
s_cbranch_execnz .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -8837,7 +9501,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8857,12 +9521,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -8879,7 +9543,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8900,17 +9564,17 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; 
GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB35_1 +; GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8929,7 +9593,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8956,12 +9620,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -8977,7 +9641,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8999,12 +9663,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_cbranch_execnz .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9020,7 +9684,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9048,12 +9712,12 @@ define void 
@global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_cbranch_execnz .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9066,7 +9730,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9087,12 +9751,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_cbranch_execnz .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9107,7 +9771,7 @@ define void 
@global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9126,12 +9790,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9146,7 +9810,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9165,12 +9829,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: 
s_cbranch_execnz .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -9184,7 +9848,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9205,12 +9869,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_cbranch_execnz .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9228,7 +9892,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt 
vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9248,12 +9912,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9271,7 +9935,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9292,18 +9956,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB36_1 +; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) 
%ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9322,7 +9986,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9349,12 +10013,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -9371,7 +10035,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: 
v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9393,12 +10057,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_cbranch_execnz .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -9414,7 +10078,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9442,12 +10106,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_cbranch_execnz .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ 
-9460,7 +10124,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9481,12 +10145,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_cbranch_execnz .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9501,7 +10165,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9520,12 +10184,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: 
s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9540,7 +10204,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9559,12 +10223,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_cbranch_execnz .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9578,7 +10242,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB41_1: 
; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9599,12 +10263,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_cbranch_execnz .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9622,7 +10286,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9642,12 +10306,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9665,7 +10329,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9686,18 +10350,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB37_1 +; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9707,7 +10371,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; 
GFX12-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -9732,13 +10396,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9746,7 +10410,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -9767,20 +10431,20 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_cbranch_execnz .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; 
GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -9806,19 +10470,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_cbranch_execnz .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -9839,13 +10503,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_cbranch_execnz .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9853,7 +10517,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -9872,13 +10536,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9886,7 +10550,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -9905,13 +10569,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_cbranch_execnz .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -9919,7 +10583,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -9939,12 +10603,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_cbranch_execnz .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -9955,7 +10619,7 @@ 
define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -9973,13 +10637,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -9990,7 +10654,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10009,19 +10673,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB38_1 +; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 
v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10031,7 +10695,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10055,12 +10719,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX940-LABEL: 
global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10068,7 +10732,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10089,19 +10753,19 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cbranch_execnz .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10126,18 +10790,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_cbranch_execnz .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10158,12 +10822,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_cbranch_execnz .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10171,7 +10835,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10190,12 
+10854,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10203,7 +10867,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10222,12 +10886,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cbranch_execnz .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -10235,7 +10899,7 @@ define void 
@global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10255,12 +10919,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_cbranch_execnz .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -10271,7 +10935,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10289,12 +10953,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10305,7 +10969,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10324,18 +10988,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB39_1 +; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10354,7 +11018,7 @@ define bfloat 
@global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -10382,13 +11046,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -10404,7 +11068,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -10426,13 +11090,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_cbranch_execnz .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, 
exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -10448,7 +11112,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -10477,13 +11141,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_cbranch_execnz .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -10496,7 +11160,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
v_mov_b32_e32 v6, v5 @@ -10517,13 +11181,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_cbranch_execnz .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -10538,7 +11202,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10559,13 +11223,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, 
vcc, 0x7fe, v0 @@ -10580,7 +11244,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10599,13 +11263,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_cbranch_execnz .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -10619,7 +11283,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10640,13 +11304,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_cbranch_execnz .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -10664,7 +11328,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10684,14 +11348,14 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -10709,7 +11373,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: 
v_lshrrev_b32_e32 v2, v6, v3 @@ -10730,7 +11394,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB40_1 +; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -10738,12 +11402,12 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10762,7 +11426,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10789,12 +11453,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -10810,7 +11474,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10832,12 +11496,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: s_cbranch_execnz .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -10853,7 +11517,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; 
GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10881,12 +11545,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_cbranch_execnz .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -10899,7 +11563,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10920,12 +11584,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_cbranch_execnz .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; 
GFX90A-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -10940,7 +11604,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10961,12 +11625,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -10981,7 +11645,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11000,12 +11664,12 @@ 
define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cbranch_execnz .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -11019,7 +11683,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11040,12 +11704,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cbranch_execnz .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11063,7 +11727,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; 
GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11083,12 +11747,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11106,7 +11770,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11127,13 +11791,13 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB41_1 +; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr 
addrspace(1) %gep, bfloat %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -11141,8 +11805,8 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; <2 x half> ; -------------------------------------------------------------------- -define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16: +define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11152,7 +11816,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11167,19 +11831,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 
0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11193,19 +11857,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_cbranch_execnz .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11221,19 +11885,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_cbranch_execnz .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; 
GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11247,19 +11911,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_cbranch_execnz .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11271,19 +11935,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: 
global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11295,20 +11959,20 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_cbranch_execnz .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -11323,13 +11987,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_cbranch_execnz .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX7-LABEL: 
global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -11346,7 +12010,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11371,14 +12035,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -11395,7 +12059,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11421,19 +12085,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; 
GFX6-NEXT: s_cbranch_execnz .LBB42_1 +; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11443,7 +12107,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11458,19 +12122,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11484,19 +12148,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_cbranch_execnz .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11512,19 +12176,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_cbranch_execnz .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11538,19 +12202,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_cbranch_execnz .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11562,19 +12226,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11586,13 +12250,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_cbranch_execnz .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -11601,7 +12265,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -11616,12 +12280,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_cbranch_execnz .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -11638,7 +12302,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11663,14 +12327,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -11687,7 +12351,7 @@ define <2 x half> 
@global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11713,7 +12377,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB43_1 +; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -11721,12 +12385,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11736,7 +12400,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: 
v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11751,19 +12415,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11777,19 +12441,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_cbranch_execnz .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: 
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11805,19 +12469,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_cbranch_execnz .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11831,19 +12495,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_cbranch_execnz .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; 
GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11855,19 +12519,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11879,13 +12543,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_cbranch_execnz .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -11894,7 +12558,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -11909,12 +12573,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_cbranch_execnz .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -11935,7 +12599,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -11960,12 +12624,12 @@ 
define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -11986,7 +12650,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12012,18 +12676,18 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB44_1 +; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fmin_noret_v2f16: +define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12033,7 +12697,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12048,18 +12712,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12073,18 +12737,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, 
s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12100,18 +12764,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_cbranch_execnz .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12125,18 +12789,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; 
GFX10-NEXT: s_cbranch_execnz .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12148,18 +12812,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12171,19 +12835,19 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: 
s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12198,12 +12862,12 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12220,7 +12884,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12245,12 +12909,12 @@ define void 
@global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12267,7 +12931,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12293,17 +12957,17 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB45_1 +; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +define void 
@global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12313,7 +12977,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12328,18 +12992,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12353,18 +13017,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; 
GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12380,18 +13044,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_cbranch_execnz .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12405,18 +13069,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; 
GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_cbranch_execnz .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12428,18 +13092,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12451,12 +13115,12 @@ define void 
@global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -12465,7 +13129,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12480,12 +13144,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12502,7 +13166,7 @@ define void 
@global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12527,12 +13191,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12549,7 +13213,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12575,18 +13239,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB46_1 +; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = 
getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12596,7 +13260,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12611,18 +13275,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off 
offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12636,18 +13300,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12663,18 +13327,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_cbranch_execnz .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12688,18 +13352,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_cbranch_execnz .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12711,18 +13375,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; 
%bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12734,12 +13398,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -12748,7 +13412,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12763,12 +13427,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: s_cbranch_execnz 
.LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -12789,7 +13453,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12814,12 +13478,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -12840,7 +13504,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12866,18 +13530,18 @@ define void 
@global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12887,7 +13551,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -12902,19 +13566,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_cbranch_execnz 
.LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -12928,19 +13592,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_cbranch_execnz .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -12956,19 +13620,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_cbranch_execnz .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -12982,19 +13646,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_cbranch_execnz .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -13008,19 +13672,19 @@ define <2 x half> 
@global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -13032,13 +13696,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_cbranch_execnz .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -13047,7 +13711,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -13062,12 +13726,12 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13084,7 +13748,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13109,14 +13773,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX6-LABEL: 
global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13133,7 +13797,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13159,7 +13823,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -13167,12 +13831,12 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 
@@ -13182,7 +13846,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -13197,18 +13861,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13222,18 +13886,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: 
+; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13249,18 +13913,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_cbranch_execnz .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13274,18 +13938,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_cbranch_execnz .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13299,18 +13963,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13322,12 +13986,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 
exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -13336,7 +14000,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -13351,12 +14015,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13373,7 +14037,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: 
v_cvt_f16_f32_e32 v4, v4 @@ -13398,12 +14062,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13420,7 +14084,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13446,13 +14110,13 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -13460,8 +14124,8 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; <2 x bfloat> ; 
-------------------------------------------------------------------- -define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16: +define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13472,7 +14136,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -13504,13 +14168,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -13519,7 +14183,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX940-NEXT: 
.LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -13546,13 +14210,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_cbranch_execnz .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -13561,7 +14225,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -13594,21 +14258,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -13635,13 +14299,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_cbranch_execnz .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -13650,7 +14314,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -13675,13 +14339,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; 
GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -13690,7 +14354,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -13715,20 +14379,20 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -13756,13 +14420,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], 
vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13778,7 +14442,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13800,14 +14464,14 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13823,7 +14487,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6-NEXT: 
.LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13846,19 +14510,19 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_cbranch_execnz .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13869,7 +14533,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -13901,13 +14565,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr 
addr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -13916,7 +14580,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -13943,13 +14607,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_cbranch_execnz .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -13958,7 +14622,7 @@ define <2 x bfloat> 
@global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -13991,21 +14655,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14032,13 +14696,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_cbranch_execnz .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; 
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14047,7 +14711,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14072,13 +14736,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14087,7 +14751,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14112,13 +14776,13 @@ 
define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cbranch_execnz .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -14127,7 +14791,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14155,12 +14819,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14176,7 +14840,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: v_and_b32_e32 v2, 
0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14198,14 +14862,14 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14221,7 +14885,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14244,7 +14908,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_cbranch_execnz .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -14252,12 +14916,12 @@ define <2 x 
bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14268,7 +14932,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -14300,13 +14964,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX940-LABEL: 
global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14315,7 +14979,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -14342,13 +15006,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_cbranch_execnz .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -14357,7 +15021,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14390,21 +15054,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14431,13 +15095,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_cbranch_execnz .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14446,7 +15110,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: 
s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14471,13 +15135,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14486,7 +15150,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14511,13 +15175,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_cbranch_execnz .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -14526,7 +15190,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14554,12 +15218,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -14579,7 +15243,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -14601,12 +15265,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -14626,7 +15290,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -14649,18 +15313,18 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_cbranch_execnz .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16: +define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr 
addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14671,7 +15335,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14702,12 +15366,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -14716,7 +15380,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14743,12 +15407,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 
v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -14757,7 +15421,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14789,20 +15453,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 @@ -14829,12 +15493,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_cbranch_execnz .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -14843,7 +15507,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14868,12 +15532,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -14882,7 +15546,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: 
s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14907,19 +15571,19 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14947,12 +15611,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: 
s_mov_b32 s6, 0 @@ -14968,7 +15632,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -14990,12 +15654,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15011,7 +15675,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15034,17 +15698,17 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_cbranch_execnz .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; 
GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15055,7 +15719,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15086,12 +15750,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15100,7 
+15764,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15127,12 +15791,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -15141,7 +15805,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15173,20 +15837,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15213,12 +15877,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_cbranch_execnz .LBB58_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15227,7 +15891,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15252,12 +15916,12 @@ define void 
@global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15266,7 +15930,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15291,12 +15955,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -15305,7 +15969,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: 
s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15333,12 +15997,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -15354,7 +16018,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15376,12 +16040,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: 
global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15397,7 +16061,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15420,18 +16084,18 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cbranch_execnz .LBB58_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15442,7 +16106,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr 
addrspace( ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15473,12 +16137,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15487,7 +16151,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15514,12 +16178,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -15528,7 +16192,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15560,20 +16224,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_cbranch_execnz .LBB59_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15600,12 +16264,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; 
GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_cbranch_execnz .LBB59_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15614,7 +16278,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15639,12 +16303,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15653,7 +16317,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: 
s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15678,12 +16342,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -15692,7 +16356,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15720,12 +16384,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 
; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -15745,7 +16409,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15767,12 +16431,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -15792,7 +16456,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15815,18 +16479,18 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cbranch_execnz 
.LBB59_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15837,7 +16501,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -15869,13 +16533,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: 
global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15884,7 +16548,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -15911,13 +16575,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_cbranch_execnz .LBB60_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -15926,7 +16590,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -15959,21 +16623,21 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16000,13 +16664,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_cbranch_execnz .LBB60_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16015,7 +16679,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: 
s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16042,13 +16706,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16057,7 +16721,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16082,13 +16746,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_cbranch_execnz .LBB60_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -16097,7 +16761,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -16125,12 +16789,12 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_cbranch_execnz .LBB60_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16146,7 +16810,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16168,14 +16832,14 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; 
GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16191,7 +16855,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16214,7 +16878,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cbranch_execnz .LBB60_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -16222,12 +16886,12 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void 
@global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -16238,7 +16902,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16269,12 +16933,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16283,7 +16947,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX940-NEXT: 
.LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16310,12 +16974,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -16324,7 +16988,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16356,20 +17020,20 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_cbranch_execnz .LBB61_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16396,12 +17060,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_cbranch_execnz .LBB61_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16410,7 +17074,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16437,12 +17101,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16451,7 +17115,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16476,12 +17140,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -16490,7 +17154,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 @@ -16518,12 +17182,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16539,7 +17203,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16561,12 +17225,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_cbranch_execnz .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16582,7 +17246,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_waitcnt vmcnt(0) ; 
GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16605,15 +17269,17 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: s_cbranch_execnz .LBB61_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll index 1f0ae39082865c..0612383c3f90b1 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll @@ -6,25 +6,25 @@ ; FIXME: This will still fail for gfx6/7 and gfx10 subtargets. 
; DISASSEMBLY-VI: .long 0xdd348000 // {{[0-9A-Z]+}}: DD348000 -; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v1, v0, v0, vcc // {{[0-9A-Z]+}}: 00020100 +; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc // {{[0-9A-Z]+}}: 00000100 define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #0 { ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[0:1], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: .LBB0_2: diff --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll index c790187f9d108a..38b9c5df7faa1b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll @@ -49,8 +49,8 @@ define amdgpu_kernel void @private_test(i32 %index, ptr addrspace(1) %out) { ; R600-LABEL: available_externally_test -; GCN-PAL: s_mov_b32 s1, available_externally@abs32@hi -; GCN-PAL: s_mov_b32 s0, available_externally@abs32@lo +; GCN-PAL: s_mov_b32 s3, available_externally@abs32@hi +; GCN-PAL: s_mov_b32 s2, available_externally@abs32@lo define amdgpu_kernel void @available_externally_test(ptr 
addrspace(1) %out) { %ptr = getelementptr [256 x i32], ptr addrspace(4) @available_externally, i32 0, i32 1 %val = load i32, ptr addrspace(4) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll index 7f6a3ad5c93460..b8ecbae3d3114c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half8: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v4, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -18,7 +18,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half8: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -28,7 +28,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half8: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -74,7 +74,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half6: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX908-NEXT: v_mov_b32_e32 
v3, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -84,7 +84,7 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half6: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -94,7 +94,7 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half6: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -132,7 +132,7 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half4: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v2, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -144,7 +144,7 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half4: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half4: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v2, 
0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -188,7 +188,7 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half2: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dword v1, v0, s[0:1] @@ -198,7 +198,7 @@ define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half2: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v1, v0, s[0:1] @@ -208,7 +208,7 @@ define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll index e54cd64798a682..f709eae990bda2 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -13,7 +13,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: test_move_load_address_to_vgpr: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dword v0, 
v1, s[0:1] glc @@ -54,7 +54,7 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: test_move_load_address_to_vgpr_d16_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v0, v1, s[0:1] glc diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index 9bee539b1e4e5c..dac3a3db7b450b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,8 +19,8 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_add_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -32,12 +32,12 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_add_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 
s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -50,8 +50,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -65,14 +65,14 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_add_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s1, -1 +; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s3, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -80,12 +80,12 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_add_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:-4096 +; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] 
offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -98,8 +98,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -112,8 +112,8 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_add_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -126,12 +126,12 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_add_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:3232 +; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -144,29 +144,29 @@ entry: define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xdeac -; VI-NEXT: s_addc_u32 s1, s1, 0xabcd +; VI-NEXT: s_add_u32 s0, s2, 0xdeac +; VI-NEXT: s_addc_u32 s1, s3, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -174,12 +174,12 @@ define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_add_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -195,8 +195,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: 
atomic_add_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -213,29 +213,29 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_add_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -251,9 +251,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 
%in, i64 %index) { ; SI-LABEL: atomic_add_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -268,18 +268,18 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_add_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -287,12 +287,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_add_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; 
GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -310,9 +310,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -332,22 +332,22 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_add_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -356,12 +356,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_add_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c 
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -381,8 +381,8 @@ entry: define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -394,8 +394,8 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_add_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -407,12 +407,12 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_add_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -424,8 +424,8 @@ entry: define 
amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_add_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -442,8 +442,8 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_add_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -460,11 +460,11 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_add_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -479,9 +479,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 
s[4:5], s[4:5], 2 @@ -496,16 +496,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_add_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -513,12 +513,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_add_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -535,9 +535,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 
s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -557,20 +557,20 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -579,12 +579,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_add_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -603,8 +603,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: 
atomic_and_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -616,8 +616,8 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_and_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -629,12 +629,12 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_and_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -647,8 +647,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_and_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -665,29 +665,29 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: 
atomic_and_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_and_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -703,9 +703,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: 
s_lshl_b64 s[4:5], s[4:5], 2 @@ -720,18 +720,18 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_and_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_and v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -739,12 +739,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_and_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -762,9 +762,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: 
s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -784,22 +784,22 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_and_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -808,12 +808,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_and_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; 
GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -833,8 +833,8 @@ entry: define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_and_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -846,8 +846,8 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_and_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -859,12 +859,12 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_and_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -876,8 +876,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_and_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ 
-894,8 +894,8 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_and_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -912,11 +912,11 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_and_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -931,9 +931,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -948,16 +948,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_and_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; 
VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_and v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -965,12 +965,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_and_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -987,9 +987,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1009,20 +1009,20 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 
-; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1031,12 +1031,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_and_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1055,8 +1055,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_sub_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1068,8 +1068,8 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 
%in) ; ; VI-LABEL: atomic_sub_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1081,12 +1081,12 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_sub_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1099,8 +1099,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_sub_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1117,29 +1117,29 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 
s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_sub_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1155,9 +1155,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1172,18 +1172,18 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_sub_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_sub v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1191,12 +1191,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_sub_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1214,9 +1214,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1236,22 +1236,22 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_sub_i32_ret_addr64_offset: 
; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1260,12 +1260,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_sub_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1285,8 +1285,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_sub_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1298,8 +1298,8 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_sub_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1311,12 +1311,12 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_sub_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1328,8 +1328,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_sub_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1346,8 +1346,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_sub_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,11 +1364,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_sub_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1383,9 +1383,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1400,16 +1400,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_sub_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_sub v[0:1], v2 ; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1417,12 +1417,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_sub_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1439,9 +1439,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1461,20 +1461,20 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; 
VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1483,12 +1483,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_sub_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1507,8 +1507,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_max_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1520,8 +1520,8 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_max_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1533,12 +1533,12 @@ define amdgpu_kernel void 
@atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_max_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1551,8 +1551,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_max_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1568,28 +1568,28 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -1604,9 +1604,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1619,29 +1619,29 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 
v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_smax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1657,9 +1657,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1678,20 +1678,20 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 
16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1701,12 +1701,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1725,8 +1725,8 @@ entry: define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_max_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1736,8 +1736,8 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_max_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1747,12 +1747,12 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 
%in) { ; ; GFX9-LABEL: atomic_max_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -1762,8 +1762,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_max_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1779,8 +1779,8 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_max_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1796,11 +1796,11 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_max_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, 
s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -1814,9 +1814,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1829,27 +1829,27 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_max_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_smax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; 
GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1864,9 +1864,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1885,18 +1885,18 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1906,12 +1906,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1929,8 +1929,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umax_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1940,8 +1940,8 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umax_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1951,12 +1951,12 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_umax_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -1967,8 +1967,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; 
SI-LABEL: atomic_umax_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1984,28 +1984,28 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2020,9 +2020,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) 
%out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2035,29 +2035,29 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_umax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2073,9 +2073,9 @@ entry: 
define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2094,20 +2094,20 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2117,12 +2117,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2141,8 +2141,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umax_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2152,8 +2152,8 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_umax_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2163,12 +2163,12 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_umax_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2178,8 +2178,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: 
atomic_umax_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2195,8 +2195,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umax_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2212,11 +2212,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umax_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2230,9 +2230,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2245,27 +2245,27 @@ define amdgpu_kernel void 
@atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umax_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_umax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2280,9 +2280,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2301,18 +2301,18 @@ define amdgpu_kernel void 
@atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2322,12 +2322,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2345,8 +2345,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 
0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2356,8 +2356,8 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_min_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2367,12 +2367,12 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_min_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -2383,8 +2383,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_min_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2400,28 +2400,28 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2436,9 +2436,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2451,29 +2451,29 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_smin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2489,9 +2489,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2510,20 +2510,20 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: 
atomic_min_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2533,12 +2533,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2557,8 +2557,8 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; 
SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2568,8 +2568,8 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2579,12 +2579,12 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_min_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2594,8 +2594,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_min_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2611,8 +2611,8 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_min_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 
s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2628,11 +2628,11 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_min_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2646,9 +2646,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2661,27 +2661,27 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_min_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: 
flat_atomic_smin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2696,9 +2696,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2717,18 +2717,18 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: 
v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2738,12 +2738,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2761,8 +2761,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umin_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2772,8 +2772,8 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umin_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2783,12 +2783,12 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_umin_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -2799,8 +2799,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umin_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2816,28 +2816,28 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_ret_offset: ; GFX9: ; %bb.0: 
; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2852,9 +2852,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2867,29 +2867,29 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umin_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_umin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_addr64_offset: ; GFX9: ; %bb.0: ; 
%entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2905,9 +2905,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2926,20 +2926,20 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umin_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 
v2, s8 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2949,12 +2949,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umin_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2973,8 +2973,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umin_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2984,8 +2984,8 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_umin_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2995,12 +2995,12 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_umin_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -3010,8 +3010,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umin_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3027,8 +3027,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umin_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3044,11 +3044,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umin_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -3062,9 +3062,9 @@ entry: define amdgpu_kernel void 
@atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3077,27 +3077,27 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umin_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_umin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3112,9 +3112,9 @@ entry: define amdgpu_kernel void 
@atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3133,18 +3133,18 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3154,12 +3154,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umin_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3177,8 +3177,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_or_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3190,8 +3190,8 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_or_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3203,12 +3203,12 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_or_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3221,8 +3221,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_or_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword 
s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3239,29 +3239,29 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_or_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3277,9 +3277,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3294,18 +3294,18 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3 ; ; VI-LABEL: atomic_or_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3313,12 +3313,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3 ; ; GFX9-LABEL: atomic_or_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3336,9 +3336,9 @@ 
entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3358,22 +3358,22 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out ; ; VI-LABEL: atomic_or_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3382,12 +3382,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out ; ; GFX9-LABEL: atomic_or_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3407,8 +3407,8 @@ entry: define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_or_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3420,8 +3420,8 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_or_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3433,12 +3433,12 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_or_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3450,8 +3450,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: 
atomic_or_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3468,8 +3468,8 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: atomic_or_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3486,11 +3486,11 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: atomic_or_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3505,9 +3505,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3522,16 +3522,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) 
%out, i32 %in, ; ; VI-LABEL: atomic_or_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3539,12 +3539,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_or_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3561,9 +3561,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3583,20 +3583,20 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3605,12 +3605,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: atomic_or_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3629,8 +3629,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xchg_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; 
SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3642,8 +3642,8 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_xchg_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3655,12 +3655,12 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_xchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3673,8 +3673,8 @@ entry: define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float %in) { ; SI-LABEL: atomic_xchg_f32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3686,8 +3686,8 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float % ; ; VI-LABEL: atomic_xchg_f32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3699,12 +3699,12 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float % ; ; GFX9-LABEL: atomic_xchg_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3717,8 +3717,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xchg_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3735,29 +3735,29 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 
+; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3773,9 +3773,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3790,18 +3790,18 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_xchg_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 
s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_swap v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3809,12 +3809,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_xchg_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3832,9 +3832,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3854,22 +3854,22 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_xchg_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3878,12 +3878,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3903,8 +3903,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xchg_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3916,8 +3916,8 @@ define 
amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_xchg_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3929,12 +3929,12 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_xchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3946,8 +3946,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xchg_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3964,8 +3964,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_xchg_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3982,11 +3982,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr 
addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_xchg_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4001,9 +4001,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4018,16 +4018,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_xchg_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_swap v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4035,12 +4035,12 @@ define amdgpu_kernel void 
@atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_xchg_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -4057,9 +4057,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4079,20 +4079,20 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 
+; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4101,12 +4101,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -4125,7 +4125,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4140,7 +4140,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_cmpxchg_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4155,7 +4155,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4173,8 +4173,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4192,31 +4192,31 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4233,10 +4233,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s7, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4252,19 +4252,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_cmpxchg_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dword s6, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap 
v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4273,17 +4273,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4298,10 +4298,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s10, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0x11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s10, s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4309,8 +4309,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: 
v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 offset:16 glc @@ -4322,24 +4322,24 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s9, s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4348,13 +4348,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_load_dword s9, s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s9, s[0:1], 
0x44 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 @@ -4376,7 +4376,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4391,7 +4391,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; VI-LABEL: atomic_cmpxchg_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4406,7 +4406,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; GFX9-LABEL: atomic_cmpxchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4423,8 +4423,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4442,8 +4442,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: atomic_cmpxchg_i32_ret: ; VI: ; %bb.0: ; %entry -; 
VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4461,12 +4461,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4482,10 +4482,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s7, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4501,17 +4501,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_cmpxchg_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dword s6, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x3c +; 
VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4520,17 +4520,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4544,10 +4544,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s10, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0x11 
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s10, s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4555,8 +4555,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 glc @@ -4568,22 +4568,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s9, s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4592,13 +4592,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; ; GFX9-LABEL: 
atomic_cmpxchg_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_load_dword s9, s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s9, s[0:1], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 @@ -4619,8 +4619,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xor_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4632,8 +4632,8 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_xor_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4645,12 +4645,12 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_xor_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4663,8 +4663,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xor_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4681,29 +4681,29 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xor_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4719,9 +4719,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4736,18 +4736,18 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_xor_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_xor v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4755,12 +4755,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_xor_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; 
GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -4778,9 +4778,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4800,22 +4800,22 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_xor_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4824,12 +4824,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_xor_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -4849,8 +4849,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xor_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4862,8 +4862,8 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_xor_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4875,12 +4875,12 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_xor_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4892,8 +4892,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xor_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4910,8 +4910,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_xor_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4928,11 +4928,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_xor_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4947,9 +4947,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; 
SI-LABEL: atomic_xor_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4964,16 +4964,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_xor_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_xor v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4981,12 +4981,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_xor_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ 
-5003,9 +5003,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5025,20 +5025,20 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -5047,12 +5047,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xor_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -5071,7 +5071,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5087,7 +5087,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5105,7 +5105,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5123,7 +5123,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -5141,7 +5141,7 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_i32_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5159,7 +5159,7 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i32_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:-512 glc @@ -5177,7 +5177,7 @@ entry: define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5193,7 +5193,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_f32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5211,7 +5211,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5229,7 +5229,7 @@ entry: define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: 
s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5245,7 +5245,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; ; VI-LABEL: atomic_load_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5261,7 +5261,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX9-LABEL: atomic_load_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -5278,8 +5278,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5298,8 +5298,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5320,11 +5320,11 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5343,8 +5343,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5363,8 +5363,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5383,11 +5383,11 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -5405,8 +5405,8 @@ entry: define 
amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_f32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5425,8 +5425,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_f32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5447,11 +5447,11 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_f32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5470,8 +5470,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -5481,25 +5481,25 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -5510,8 +5510,8 @@ entry: define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5521,23 +5521,23 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4 @@ -5547,8 +5547,8 @@ entry: define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5558,23 +5558,23 @@ define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f32: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store atomic float %in, ptr addrspace(1) %out seq_cst, align 4 @@ -5584,8 +5584,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s2, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5598,8 +5598,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace ; ; VI-LABEL: atomic_store_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5614,14 +5614,14 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace ; ; GFX9-LABEL: atomic_store_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5634,8 +5634,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_f32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s2, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5648,8 +5648,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspa ; ; VI-LABEL: atomic_store_f32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5664,14 +5664,14 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspa ; ; GFX9-LABEL: atomic_store_f32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5684,8 +5684,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; 
SI-NEXT: s_load_dword s8, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s8, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 @@ -5699,8 +5699,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5713,14 +5713,14 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5732,8 +5732,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_f32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s8, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s8, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 @@ -5747,8 +5747,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) % ; ; VI-LABEL: atomic_store_f32_addr64: ; VI: ; %bb.0: ; %entry 
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5761,14 +5761,14 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) % ; ; GFX9-LABEL: atomic_store_f32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5780,7 +5780,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i8_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5796,7 +5796,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; ; VI-LABEL: atomic_load_i8_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5812,7 +5812,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; ; GFX9-LABEL: atomic_load_i8_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:16 glc @@ -5830,7 +5830,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i8_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -5848,7 +5848,7 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; ; VI-LABEL: atomic_load_i8_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5866,7 +5866,7 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; ; GFX9-LABEL: atomic_load_i8_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:-512 glc @@ -5884,8 +5884,8 @@ entry: define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i8_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5895,25 +5895,25 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) ; ; VI-LABEL: atomic_store_i8_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, 
s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i8_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(1) %out, i64 16 @@ -5924,8 +5924,8 @@ entry: define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5935,23 +5935,23 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i8: 
; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store atomic i8 %in, ptr addrspace(1) %out seq_cst, align 1 @@ -5961,7 +5961,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5977,7 +5977,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5993,7 +5993,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc @@ -6011,7 +6011,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i16_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 
@@ -6029,7 +6029,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_i16_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6047,7 +6047,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i16_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc @@ -6065,8 +6065,8 @@ entry: define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6076,25 +6076,25 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, 
s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr addrspace(1) %out, i64 8 @@ -6105,8 +6105,8 @@ entry: define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6116,23 +6116,23 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store 
atomic i16 %in, ptr addrspace(1) %out seq_cst, align 2 @@ -6142,8 +6142,8 @@ entry: define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6153,25 +6153,25 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %o ; ; VI-LABEL: atomic_store_f16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr half, ptr addrspace(1) %out, i64 8 @@ -6182,8 +6182,8 @@ entry: define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6193,23 +6193,23 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store atomic half %in, ptr addrspace(1) %out seq_cst, align 2 @@ -6219,8 +6219,8 @@ entry: define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_bf16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6230,25 +6230,25 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) ; ; VI-LABEL: atomic_store_bf16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_bf16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %out, i64 8 store atomic bfloat %in, ptr addrspace(1) %gep seq_cst, align 2 @@ -6258,8 +6258,8 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_bf16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6269,23 +6269,23 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) ; ; VI-LABEL: atomic_store_bf16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; 
VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm store atomic bfloat %in, ptr addrspace(1) %out seq_cst, align 2 ret void @@ -6294,8 +6294,8 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6307,8 +6307,8 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_inc_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6320,12 +6320,12 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_inc_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, 
s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6338,8 +6338,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -6353,14 +6353,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_inc_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s1, -1 +; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s3, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6368,12 +6368,12 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_inc_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:-4096 +; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6386,8 +6386,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6400,8 +6400,8 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_inc_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6414,12 +6414,12 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_inc_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:3232 +; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6432,29 +6432,29 @@ entry: define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; 
SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xdeac -; VI-NEXT: s_addc_u32 s1, s1, 0xabcd +; VI-NEXT: s_add_u32 s0, s2, 0xdeac +; VI-NEXT: s_addc_u32 s1, s3, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6462,12 +6462,12 @@ define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_inc_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] ; GFX9-NEXT: 
s_waitcnt vmcnt(0) @@ -6482,8 +6482,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_inc_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6500,29 +6500,29 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_inc_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_inc_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[4:5] offset:16 glc ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6538,9 +6538,9 @@ entry: define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_inc_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -6555,18 +6555,18 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_inc_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6574,12 +6574,12 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_inc_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: 
s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -6597,9 +6597,9 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_inc_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -6619,22 +6619,22 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_inc_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -6643,12 +6643,12 @@ define 
amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_inc_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -6668,8 +6668,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6681,8 +6681,8 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_dec_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6694,12 +6694,12 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_dec_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6712,8 +6712,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -6727,14 +6727,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_dec_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s1, -1 +; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s3, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6742,12 +6742,12 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_dec_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:-4096 +; GFX9-NEXT: 
global_atomic_dec v0, v1, s[2:3] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6760,8 +6760,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6774,8 +6774,8 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_dec_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6788,12 +6788,12 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_dec_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:3232 +; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6806,29 +6806,29 @@ entry: define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: 
s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xdeac -; VI-NEXT: s_addc_u32 s1, s1, 0xabcd +; VI-NEXT: s_add_u32 s0, s2, 0xdeac +; VI-NEXT: s_addc_u32 s1, s3, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6836,12 +6836,12 @@ define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_dec_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6856,8 +6856,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr 
addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_dec_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6874,29 +6874,29 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_dec_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_dec_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6912,9 +6912,9 @@ entry: define amdgpu_kernel void 
@atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_dec_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -6929,18 +6929,18 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_dec_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6948,12 +6948,12 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_dec_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 
s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -6971,9 +6971,9 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_dec_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -6993,22 +6993,22 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_dec_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -7017,12 +7017,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_dec_i32_ret_addr64_offset: ; GFX9: 
; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -7042,7 +7042,7 @@ entry: define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -7058,7 +7058,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_f16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7074,7 +7074,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc @@ -7091,7 +7091,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f16_negoffset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -7109,7 +7109,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_f16_negoffset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7127,7 +7127,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_f16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc @@ -7144,7 +7144,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_bf16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -7160,7 +7160,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; ; VI-LABEL: atomic_load_bf16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7176,7 +7176,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; ; GFX9-LABEL: atomic_load_bf16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, 
s[0:1] offset:16 glc @@ -7193,7 +7193,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_bf16_negoffset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -7211,7 +7211,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; ; VI-LABEL: atomic_load_bf16_negoffset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7229,7 +7229,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; ; GFX9-LABEL: atomic_load_bf16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index a7ba8a084272b4..516c92f1640eae 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -4616,7 +4616,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -4648,7 +4648,7 @@ define amdgpu_kernel void 
@atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -4679,7 +4679,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s3, 31 @@ -4714,8 +4714,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -4753,8 +4753,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -4789,24 +4789,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc +; GFX9-NEXT: v_max_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -4829,7 +4829,7 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -4861,7 +4861,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_max_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -4890,7 +4890,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: 
atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s3, 31 @@ -4924,8 +4924,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -4963,8 +4963,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -4997,24 +4997,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 
s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc +; GFX9-NEXT: v_max_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -5869,7 +5869,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -5901,7 +5901,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -5932,7 +5932,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s3, 31 @@ -5967,8 +5967,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 
%in, i32 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -6006,8 +6006,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -6042,24 +6042,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; 
GFX9-NEXT: v_max_u32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc +; GFX9-NEXT: v_max_u32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -6082,8 +6082,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -6121,8 +6121,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -6155,24 +6155,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: 
s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_u32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc +; GFX9-NEXT: v_max_u32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -7860,7 +7860,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -7892,7 +7892,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -7923,7 +7923,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s3, 31 @@ -7958,8 +7958,8 @@ entry: define amdgpu_kernel void 
@atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -7997,8 +7997,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -8033,24 +8033,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start ; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc +; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -8073,36 +8073,36 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[0:1], 0x0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s3, s[4:5], 0x0 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB130_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_min_i32_e32 v0, s6, v1 +; SI-NEXT: v_min_i32_e32 v0, s2, v1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB130_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s3, s[4:5], 0x0 @@ -8126,24 +8126,24 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_min_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB130_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -8155,8 +8155,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) 
; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -8194,8 +8194,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -8228,24 +8228,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc +; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 diff --git 
a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 3bf52a56fef5b5..68482ca3eaf877 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_add_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -20,7 +20,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_add_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -33,7 +33,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_add_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -45,7 +45,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -62,8 +62,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_add_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; 
CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -81,8 +81,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -100,12 +100,12 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_add_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -115,8 +115,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_add_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -137,8 +137,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: 
atomic_add_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -154,8 +154,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_add_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -173,12 +173,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_add_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -190,8 +190,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_add_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ 
-212,7 +212,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_add_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -233,7 +233,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_add_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -256,7 +256,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -272,7 +272,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -297,7 +297,7 @@ entry: define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_add_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -312,7 +312,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; ; 
VI-LABEL: atomic_add_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -327,7 +327,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_add_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -339,7 +339,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -355,8 +355,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_add_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -374,8 +374,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_add_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -393,12 +393,12 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: 
atomic_add_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -408,8 +408,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_add_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -429,8 +429,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_add_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -446,8 +446,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_add_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 3 @@ -463,12 +463,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_add_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -480,8 +480,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_add_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -501,7 +501,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_add_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -522,7 +522,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -543,7 +543,7 @@ define amdgpu_kernel void 
@atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_add_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -559,7 +559,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_add_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -583,7 +583,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_and_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -596,7 +596,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_and_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -609,7 +609,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_and_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -621,7 +621,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], 
s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -638,8 +638,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_and_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -657,8 +657,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -676,12 +676,12 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_and_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -691,8 +691,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_and_i64_ret_offset: ; GFX12: ; %bb.0: ; 
%entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -713,8 +713,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -730,8 +730,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_and_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -749,12 +749,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_and_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -766,8 +766,8 @@ define 
amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_and_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -788,7 +788,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -809,7 +809,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_and_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -832,7 +832,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -848,7 +848,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: 
v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -873,7 +873,7 @@ entry: define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_and_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -888,7 +888,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_and_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -903,7 +903,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_and_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -915,7 +915,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -931,8 +931,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_and_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -950,8 +950,8 @@ define amdgpu_kernel void 
@atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_and_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -969,12 +969,12 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_and_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -984,8 +984,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_and_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1005,8 +1005,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1022,8 +1022,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_and_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1039,12 +1039,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_and_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -1056,8 +1056,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_and_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1077,7 +1077,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1119,7 +1119,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_and_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1135,7 +1135,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1159,7 +1159,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_sub_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_sub_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1185,7 +1185,7 @@ 
define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_sub_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1197,7 +1197,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -1214,8 +1214,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_sub_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1233,8 +1233,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1252,12 +1252,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_sub_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1267,8 +1267,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_sub_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1289,8 +1289,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1306,8 +1306,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_sub_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1325,12 +1325,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: 
atomic_sub_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -1342,8 +1342,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_sub_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1364,7 +1364,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1385,7 +1385,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_sub_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1408,7 +1408,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: 
atomic_sub_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1424,7 +1424,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1449,7 +1449,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_sub_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1464,7 +1464,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_sub_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_sub_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1491,7 +1491,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -1507,8 +1507,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_sub_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1526,8 +1526,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_sub_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1545,12 +1545,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_sub_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1560,8 +1560,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_sub_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1581,8 +1581,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1598,8 +1598,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_sub_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1615,12 +1615,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_sub_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -1632,8 +1632,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_sub_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: 
s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1653,7 +1653,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1674,7 +1674,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1695,7 +1695,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_sub_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1711,7 +1711,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1735,7 +1735,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: 
atomic_max_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1746,7 +1746,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_max_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1757,7 +1757,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_max_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1767,7 +1767,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -1784,8 +1784,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_max_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1802,8 +1802,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i64_ret_offset: ; VI: ; %bb.0: ; 
%entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1820,12 +1820,12 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -1834,8 +1834,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_max_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1856,8 +1856,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; 
CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1871,8 +1871,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1888,12 +1888,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -1903,8 +1903,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_max_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1925,7 +1925,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1945,7 +1945,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1967,7 +1967,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1982,7 +1982,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2007,7 +2007,7 @@ entry: define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_max_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2020,7 +2020,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_max_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2033,7 +2033,7 @@ define amdgpu_kernel void 
@atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_max_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2043,7 +2043,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2059,8 +2059,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_max_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2077,8 +2077,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_max_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2095,12 +2095,12 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_max_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 
v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -2109,8 +2109,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_max_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2130,8 +2130,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2145,8 +2145,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_max_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2160,12 +2160,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_max_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -2175,8 +2175,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_max_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2196,7 +2196,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2216,7 +2216,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2236,7 +2236,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -2251,7 +2251,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_max_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2275,7 +2275,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_umax_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2286,7 +2286,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umax_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2297,7 +2297,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umax_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2307,7 +2307,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2324,8 +2324,8 @@ entry: 
define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umax_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2342,8 +2342,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2360,12 +2360,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -2374,8 +2374,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_umax_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; 
GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2396,8 +2396,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2411,8 +2411,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2428,12 +2428,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -2443,8 +2443,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_umax_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2465,7 +2465,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2485,7 +2485,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2507,7 +2507,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -2522,7 +2522,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2547,7 +2547,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; 
CI-LABEL: atomic_umax_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2560,7 +2560,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_umax_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2573,7 +2573,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_umax_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2583,7 +2583,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2599,8 +2599,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umax_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2617,8 +2617,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umax_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2635,12 +2635,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umax_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -2649,8 +2649,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: atomic_umax_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2670,8 +2670,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2685,8 +2685,8 @@ define amdgpu_kernel 
void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umax_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2700,12 +2700,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umax_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -2715,8 +2715,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-LABEL: atomic_umax_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2736,7 +2736,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ 
-2756,7 +2756,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2776,7 +2776,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -2791,7 +2791,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2815,7 +2815,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_min_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2826,7 +2826,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_min_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2837,7 +2837,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_min_i64_offset: ; GFX9: ; 
%bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2847,7 +2847,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2864,8 +2864,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_min_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2882,8 +2882,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2900,12 +2900,12 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -2914,8 +2914,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_min_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2936,8 +2936,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2951,8 +2951,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2968,12 +2968,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -2983,8 +2983,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_min_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3005,7 +3005,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3025,7 +3025,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3047,7 +3047,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], 
s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3062,7 +3062,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3087,7 +3087,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_min_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3100,7 +3100,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_min_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3113,7 +3113,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_min_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -3123,7 +3123,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3139,8 +3139,8 @@ entry: define amdgpu_kernel void 
@atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_min_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3157,8 +3157,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_min_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3175,12 +3175,12 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_min_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -3189,8 +3189,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_min_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3210,8 +3210,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3225,8 +3225,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_min_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3240,12 +3240,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_min_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -3255,8 +3255,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_min_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3276,7 +3276,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3296,7 +3296,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3316,7 +3316,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3331,7 +3331,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_min_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3355,7 +3355,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_umin_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3366,7 +3366,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umin_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3377,7 +3377,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umin_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -3387,7 +3387,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3404,8 +3404,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umin_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3422,8 +3422,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3440,12 +3440,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umin_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -3454,8 +3454,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_umin_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3476,8 +3476,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3491,8 +3491,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr 
addrspace(1) %out, ; ; VI-LABEL: atomic_umin_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3508,12 +3508,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umin_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -3523,8 +3523,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_umin_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3545,7 +3545,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3565,7 
+3565,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umin_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3587,7 +3587,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3602,7 +3602,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3627,7 +3627,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_umin_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3640,7 +3640,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_umin_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3653,7 +3653,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_umin_i64: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -3663,7 +3663,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3679,8 +3679,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umin_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3697,8 +3697,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umin_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3715,12 +3715,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umin_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -3729,8 +3729,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: atomic_umin_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3750,8 +3750,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3765,8 +3765,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umin_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3780,12 +3780,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umin_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -3795,8 +3795,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-LABEL: atomic_umin_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3816,7 +3816,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3836,7 +3836,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3856,7 +3856,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umin_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3871,7 
+3871,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3895,7 +3895,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_or_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3908,7 +3908,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_or_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3921,7 +3921,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_or_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -3933,7 +3933,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3950,8 +3950,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; 
CI-LABEL: atomic_or_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3969,8 +3969,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3988,12 +3988,12 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: atomic_or_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4003,8 +4003,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; GFX12-LABEL: atomic_or_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; 
GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4025,8 +4025,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4042,8 +4042,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; ; VI-LABEL: atomic_or_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4061,12 +4061,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; ; GFX9-LABEL: atomic_or_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -4078,8 +4078,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; GFX12-LABEL: atomic_or_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4100,7 +4100,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4121,7 +4121,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; ; VI-LABEL: atomic_or_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -4144,7 +4144,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; ; GFX9-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -4160,7 +4160,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4185,7 +4185,7 @@ entry: define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_or_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: 
s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4200,7 +4200,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_or_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4215,7 +4215,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_or_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4227,7 +4227,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4243,8 +4243,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_or_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4262,8 +4262,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: atomic_or_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 
0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4281,12 +4281,12 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: atomic_or_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4296,8 +4296,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; GFX12-LABEL: atomic_or_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4317,8 +4317,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4334,8 +4334,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_or_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4351,12 +4351,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_or_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -4368,8 +4368,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_or_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4389,7 +4389,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4410,7 +4410,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; 
VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -4431,7 +4431,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: atomic_or_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -4447,7 +4447,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; ; GFX12-LABEL: atomic_or_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4471,7 +4471,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_xchg_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4484,7 +4484,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_xchg_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4497,7 +4497,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_xchg_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 
@@ -4509,7 +4509,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4526,7 +4526,7 @@ entry: define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double %in) { ; CI-LABEL: atomic_xchg_f64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4539,7 +4539,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; ; VI-LABEL: atomic_xchg_f64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4552,7 +4552,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; ; GFX9-LABEL: atomic_xchg_f64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4564,7 +4564,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4581,7 +4581,7 @@ entry: define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr %in) { ; 
CI-LABEL: atomic_xchg_pointer_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4594,7 +4594,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_pointer_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4607,7 +4607,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_pointer_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4619,7 +4619,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4636,8 +4636,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xchg_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4655,8 +4655,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: 
atomic_xchg_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4674,12 +4674,12 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4689,8 +4689,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_xchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4711,8 +4711,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4728,8 +4728,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_xchg_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4747,12 +4747,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_xchg_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -4764,8 +4764,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_xchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4786,7 +4786,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 
+; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4807,7 +4807,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_xchg_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -4830,7 +4830,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -4846,7 +4846,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4871,7 +4871,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_xchg_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4886,7 +4886,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_xchg_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4901,7 +4901,7 
@@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_xchg_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4913,7 +4913,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4929,8 +4929,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xchg_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4948,8 +4948,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_xchg_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4967,12 +4967,12 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_xchg_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4982,8 +4982,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: atomic_xchg_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5003,8 +5003,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5020,8 +5020,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_xchg_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5037,12 +5037,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_xchg_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -5054,8 +5054,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-LABEL: atomic_xchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5075,7 +5075,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5096,7 +5096,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -5117,7 +5117,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], 
s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -5133,7 +5133,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5157,7 +5157,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_xor_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -5170,7 +5170,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_xor_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5183,7 +5183,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_xor_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -5195,7 +5195,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -5212,8 
+5212,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xor_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5231,8 +5231,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5250,12 +5250,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xor_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5265,8 +5265,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_xor_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5287,8 +5287,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5304,8 +5304,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_xor_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5323,12 +5323,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_xor_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -5340,8 +5340,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_xor_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: 
s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5362,7 +5362,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5383,7 +5383,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_xor_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -5406,7 +5406,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -5422,7 +5422,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5447,7 +5447,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: 
atomic_xor_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5462,7 +5462,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_xor_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5477,7 +5477,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_xor_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -5489,7 +5489,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -5505,8 +5505,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xor_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5524,8 +5524,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_xor_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: 
s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5543,12 +5543,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_xor_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5558,8 +5558,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_xor_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5579,8 +5579,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5596,8 +5596,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) 
%out, i64 %in, ; ; VI-LABEL: atomic_xor_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5613,12 +5613,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_xor_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -5630,8 +5630,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_xor_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5651,7 +5651,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5672,7 +5672,7 @@ define amdgpu_kernel void 
@atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -5693,7 +5693,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xor_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -5709,7 +5709,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5733,50 +5733,50 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 +; CI-NEXT: s_mov_b32 s0, s4 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 +; CI-NEXT: v_mov_b32_e32 v2, s8 +; CI-NEXT: v_mov_b32_e32 v3, s9 
+; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5785,8 +5785,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 ; GFX12-LABEL: atomic_cmpxchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 
s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 @@ -5804,52 +5804,52 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_soffset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, 0x11940 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 +; CI-NEXT: s_mov_b32 s0, s4 +; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s4, 0x11940 ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], s2 +; CI-NEXT: v_mov_b32_e32 v2, s8 +; CI-NEXT: v_mov_b32_e32 v3, s9 +; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s4 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, 0x11940 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, 
s5 +; VI-NEXT: s_mov_b32 s4, 0x11940 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x11000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] offset:2368 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5858,8 +5858,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 ; GFX12-LABEL: atomic_cmpxchg_i64_soffset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 @@ -5877,7 +5877,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 
s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5897,7 +5897,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5917,7 +5917,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -5932,7 +5932,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -5955,7 +5955,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5974,7 +5974,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_cmpxchg_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; VI-NEXT: 
s_add_u32 s0, s0, s4 @@ -5994,7 +5994,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6011,7 +6011,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7 @@ -6033,32 +6033,32 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; CI-NEXT: s_mov_b32 s15, 0xf000 -; CI-NEXT: s_mov_b32 s14, -1 +; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: s_mov_b32 s12, s6 -; CI-NEXT: s_mov_b32 s13, s7 +; CI-NEXT: s_lshl_b64 s[10:11], s[10:11], 3 +; CI-NEXT: v_mov_b32_e32 v4, s10 +; CI-NEXT: s_mov_b32 s0, s6 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s15 +; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v2, s12 +; 
CI-NEXT: v_mov_b32_e32 v3, s13 +; CI-NEXT: v_mov_b32_e32 v5, s11 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -6083,9 +6083,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX9-NEXT: s_add_u32 s2, s4, s2 @@ -6103,8 +6103,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1 @@ -6131,50 +6131,50 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 +; CI-NEXT: s_mov_b32 s0, s4 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 +; CI-NEXT: v_mov_b32_e32 v2, s8 +; CI-NEXT: v_mov_b32_e32 v3, s9 +; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6183,8 +6183,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6 ; GFX12-LABEL: atomic_cmpxchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 @@ -6201,7 +6201,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6221,7 +6221,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: atomic_cmpxchg_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6241,7 +6241,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -6256,7 +6256,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -6278,7 +6278,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6297,7 +6297,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; ; VI-LABEL: atomic_cmpxchg_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; VI-NEXT: s_add_u32 s0, s0, s4 @@ -6315,7 +6315,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; ; GFX9-LABEL: atomic_cmpxchg_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6332,7 +6332,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, 
s3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7 @@ -6353,32 +6353,32 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; CI-NEXT: s_mov_b32 s15, 0xf000 -; CI-NEXT: s_mov_b32 s14, -1 +; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: s_mov_b32 s12, s6 -; CI-NEXT: s_mov_b32 s13, s7 +; CI-NEXT: s_lshl_b64 s[10:11], s[10:11], 3 +; CI-NEXT: v_mov_b32_e32 v4, s10 +; CI-NEXT: s_mov_b32 s0, s6 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s15 +; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v2, s12 +; CI-NEXT: v_mov_b32_e32 v3, s13 +; CI-NEXT: v_mov_b32_e32 v5, s11 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; VI-NEXT: s_add_u32 s2, s4, s2 @@ -6401,9 +6401,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr 
addrspace(1) %out, ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX9-NEXT: s_add_u32 s2, s4, s2 @@ -6421,8 +6421,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1 @@ -6448,7 +6448,7 @@ entry: define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_load_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6464,7 +6464,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6482,7 +6482,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc @@ -6493,7 +6493,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; ; GFX12-LABEL: atomic_load_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT @@ -6513,7 +6513,7 @@ entry: define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_load_i64_neg_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: v_not_b32_e32 v0, 31 ; CI-NEXT: v_mov_b32_e32 v1, -1 @@ -6531,7 +6531,7 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; ; VI-LABEL: atomic_load_i64_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6549,7 +6549,7 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; ; GFX9-LABEL: atomic_load_i64_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:-32 glc @@ -6560,7 +6560,7 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; ; GFX12-LABEL: atomic_load_i64_neg_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:-32 th:TH_LOAD_NT @@ -6580,7 +6580,7 @@ entry: define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_load_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6596,7 +6596,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; ; VI-LABEL: atomic_load_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6612,7 +6612,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX9-LABEL: atomic_load_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc @@ -6623,7 +6623,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX12-LABEL: atomic_load_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT @@ -6642,8 +6642,8 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_load_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 
; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6662,8 +6662,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6684,11 +6684,11 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc @@ -6700,8 +6700,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; GFX12-LABEL: atomic_load_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6725,28 +6725,28 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_load_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 
s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_mov_b32 s9, s7 +; CI-NEXT: s_mov_b32 s0, s6 +; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 +; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_load_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6765,11 +6765,11 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc @@ -6781,8 +6781,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr 
addr ; GFX12-LABEL: atomic_load_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6805,8 +6805,8 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_load_f64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6825,8 +6825,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_f64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6847,11 +6847,11 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_f64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] 
offset:32 glc @@ -6863,8 +6863,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; GFX12-LABEL: atomic_load_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6888,7 +6888,7 @@ entry: define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_store_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6901,7 +6901,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s2, 32 @@ -6914,7 +6914,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -6924,7 +6924,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_store_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 
@@ -6941,7 +6941,7 @@ entry: define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_store_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6954,7 +6954,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -6965,7 +6965,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; ; GFX9-LABEL: atomic_store_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -6975,7 +6975,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; ; GFX12-LABEL: atomic_store_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -6991,8 +6991,8 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_store_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -7007,8 +7007,8 @@ define amdgpu_kernel void 
@atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; ; VI-LABEL: atomic_store_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7024,12 +7024,12 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; ; GFX9-LABEL: atomic_store_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s1, s7, s1 @@ -7039,8 +7039,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; GFX12-LABEL: atomic_store_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -7061,24 +7061,24 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_store_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[8:11], 0 addr64 +; CI-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7092,12 +7092,12 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s1, s7, s1 @@ -7107,8 +7107,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou ; GFX12-LABEL: atomic_store_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -7128,8 +7128,8 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_store_f64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -7144,8 +7144,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; ; VI-LABEL: atomic_store_f64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7161,12 +7161,12 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; ; GFX9-LABEL: atomic_store_f64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s1, s7, s1 @@ -7176,8 +7176,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; GFX12-LABEL: atomic_store_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: 
s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -7198,7 +7198,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_inc_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -7211,7 +7211,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_inc_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -7224,7 +7224,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_inc_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -7236,7 +7236,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -7253,8 +7253,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_inc_i64_ret_offset: ; CI: ; 
%bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -7272,8 +7272,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_inc_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7291,12 +7291,12 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_inc_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7306,8 +7306,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_inc_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -7328,8 +7328,8 
@@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_inc_i64_incr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7345,8 +7345,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_inc_i64_incr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7364,12 +7364,12 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_inc_i64_incr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -7381,8 +7381,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_inc_i64_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; 
GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -7403,7 +7403,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_dec_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -7416,7 +7416,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_dec_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -7429,7 +7429,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_dec_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -7441,7 +7441,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -7458,8 +7458,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_dec_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; 
CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -7477,8 +7477,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_dec_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7496,12 +7496,12 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_dec_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7511,8 +7511,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_dec_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -7533,8 +7533,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_dec_i64_decr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7550,8 +7550,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_dec_i64_decr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7569,12 +7569,12 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_dec_i64_decr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -7586,8 +7586,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_dec_i64_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll 
b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 8897ad3e950a58..cafd35afea6ebd 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -4866,8 +4866,8 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 @@ -4905,8 +4905,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -4941,15 +4941,15 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; 
GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4981,7 +4981,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -5025,7 +5025,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -5064,7 +5064,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -5107,8 +5107,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 @@ -5146,8 +5146,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_max_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; VI-NEXT: s_add_u32 s4, s0, s4 @@ -5180,15 +5180,15 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_max_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5219,7 +5219,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -5263,7 +5263,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s6, s0, s6 @@ -5300,7 +5300,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], 
s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6328,8 +6328,8 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_umax_i64_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 @@ -6367,8 +6367,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6403,15 +6403,15 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) @@ -6443,7 +6443,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_umax_i64_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -6487,7 +6487,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6526,7 +6526,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6569,7 +6569,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_umax_i64_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -6613,7 +6613,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s6, s0, s6 
@@ -6650,7 +6650,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8664,8 +8664,8 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_min_i64_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 @@ -8703,8 +8703,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8739,15 +8739,15 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 
s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8779,7 +8779,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_min_i64_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -8823,7 +8823,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8862,7 +8862,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8905,7 +8905,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: atomic_min_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SI-NEXT: s_mov_b64 s[8:9], 0 @@ -8942,7 +8942,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_min_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 @@ -8972,7 +8972,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_min_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9006,7 +9006,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_min_i64_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -9050,7 +9050,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s6, s0, s6 @@ -9087,7 +9087,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index ab32efc4d3cd8e..04df04a5c299b3 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -18,15 +18,15 @@ declare double @div.double.value() 
define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,23 +54,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, 
v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -118,22 +118,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: 
global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -141,30 +141,30 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1164-NEXT: 
v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[2:3] +; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[0:1] ; GFX1164-NEXT: .LBB0_2: ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -172,20 +172,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; 
GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[2:3] +; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[0:1] ; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -193,23 +193,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -225,23 +225,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec -; 
GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -257,22 +257,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; 
GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -280,30 +280,30 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-DPP-NEXT: ; 
%bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3] +; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1] ; GFX1164-DPP-NEXT: .LBB0_2: ; GFX1164-DPP-NEXT: s_nop 0 ; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -311,20 +311,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3] +; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1] ; 
GFX1132-DPP-NEXT: .LBB0_2: ; GFX1132-DPP-NEXT: s_nop 0 ; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1054,24 +1054,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ret void } -define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -1101,27 +1101,27 @@ 
define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1143,33 +1143,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-LABEL: 
global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -1183,31 +1183,31 @@ define 
amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: .LBB2_3: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1216,19 +1216,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -1240,7 +1240,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1265,14 +1265,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: .LBB2_3: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: 
global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -1282,14 +1282,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1298,34 +1298,34 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; 
GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1347,33 +1347,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: .LBB2_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -1387,31 +1387,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: .LBB2_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: 
s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1420,19 +1420,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: 
s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -1444,7 +1444,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1469,14 +1469,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: .LBB2_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -1486,14 +1486,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1502,9 +1502,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -1513,8 +1513,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope } -define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -1564,7 +1564,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1630,7 +1630,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1696,7 +1696,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_5: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1761,7 +1761,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: .LBB3_5: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 @@ -1822,7 +1822,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: .LBB3_5: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: ; 
GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 @@ -1882,7 +1882,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1968,7 +1968,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -2050,7 +2050,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: .LBB3_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -2126,7 +2126,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: .LBB3_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 @@ -2208,7 +2208,7 @@ 
define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: .LBB3_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 @@ -2289,21 +2289,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -2335,25 +2335,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2377,31 +2377,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: 
v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -2417,29 +2417,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, 
exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2448,19 +2448,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, 
v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -2472,7 +2472,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2499,12 +2499,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; 
GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -2514,14 +2514,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2530,34 +2530,34 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; 
GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2581,31 +2581,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, 
SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -2621,29 +2621,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: 
s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2652,19 +2652,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -2676,7 +2676,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2703,12 +2703,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; 
GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -2718,14 +2718,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2734,9 +2734,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -3467,8 +3467,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ } -define amdgpu_kernel void 
@global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -3518,7 +3518,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3584,7 +3584,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB6_5: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3650,7 +3650,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_5: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3715,7 +3715,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: .LBB6_5: ; 
GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 @@ -3763,7 +3763,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: .LBB6_4: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 @@ -3810,7 +3810,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB6_4: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3896,7 +3896,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3978,7 +3978,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-DPP-LABEL: 
global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -4054,7 +4054,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 @@ -4123,7 +4123,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: .LBB6_2: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 @@ -4191,21 +4191,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 
+; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -4237,25 +4237,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 
s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4279,31 +4279,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -4319,29 +4319,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4350,19 +4350,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -4374,7 +4374,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ 
-4401,12 +4401,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -4416,14 +4416,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4432,34 +4432,34 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 
s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4483,31 +4483,31 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: 
s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -4523,29 +4523,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; 
GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4554,19 +4554,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -4578,7 +4578,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4605,12 +4605,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -4620,14 +4620,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4636,9 +4636,9 @@ define 
amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -5422,1708 +5422,891 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] 
-; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 -; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v4 -; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX7LESS-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v4, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_waitcnt expcnt(2) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; 
GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; 
GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 -; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[0:1], v[3:4], v[41:42] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: 
s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; 
GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1064-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, 
v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 ; 
GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1032-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; 
GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, 
s44 +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1164-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: 
global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-NEXT: .LBB9_3: -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s6, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v40, v0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s6 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; 
GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 -; GFX1132-NEXT: .LBB9_3: -; GFX1132-NEXT: s_set_inst_prefetch_distance 
0x2 -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-NEXT: .LBB9_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 -; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; 
GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[3:4], v[41:42] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 
s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; 
GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1032-DPP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, 
off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: 
v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s6, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s6 -; 
GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; 
GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 + %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8 ret void } define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, 
SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], 
s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_waitcnt expcnt(2) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 
s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 
s[38:39], s[0:1] -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; 
GFX9-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 
v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-NEXT: v_mov_b32_e32 v40, v0 -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: 
v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-NEXT: s_addc_u32 s49, s49, 
0 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; 
GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-NEXT: v_mov_b32_e32 v40, v0 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: 
s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] 
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v42, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v42 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; 
GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 
0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: 
s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; 
GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 
v1 -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 
vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; 
GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], 
v[1:2], v[40:41] -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, 
v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1032-DPP-NEXT: 
; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; 
GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; 
GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; 
GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() - %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 ret void } -define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -7156,27 +6339,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: .LBB11_3: ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -7199,25 +6382,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-NEXT: .LBB11_3: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 
exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -7240,32 +6423,32 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-NEXT: .LBB11_3: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: 
s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -7274,19 +6457,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-NEXT: .LBB11_3: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -7298,7 +6481,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -7324,14 +6507,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-NEXT: .LBB11_3: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -7341,14 +6524,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7357,34 +6540,34 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: 
s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7407,25 +6590,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; 
GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -7448,32 +6631,32 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 
v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -7482,19 +6665,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -7506,7 +6689,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -7532,14 +6715,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, 
exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -7549,14 +6732,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7565,9 +6748,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -7575,8 +6758,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ret void } -define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -7629,7 +6812,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7675,7 +6858,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7721,7 +6904,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7767,7 +6950,7 
@@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b32 s14, s8 ; GFX1164-NEXT: s_add_u32 s8, s2, 44 @@ -7804,7 +6987,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-NEXT: s_addc_u32 s9, s3, 0 @@ -7839,7 +7022,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7885,7 +7068,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7931,7 +7114,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; 
GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7977,7 +7160,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 @@ -8014,7 +7197,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 @@ -8060,17 +7243,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: 
v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -8105,25 +7288,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: 
v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8148,23 +7331,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -8189,30 +7372,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -8221,19 
+7404,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -8245,7 +7428,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -8273,12 +7456,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 
v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -8288,14 +7471,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8304,34 +7487,34 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: 
s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8356,23 +7539,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; 
GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -8397,30 +7580,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; 
GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -8429,19 +7612,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -8453,7 +7636,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -8481,12 +7664,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: 
s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -8496,14 +7679,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8512,9 +7695,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -9000,8 +8183,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ret void } -define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: 
global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -9054,7 +8237,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -9100,7 +8283,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -9146,7 +8329,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -9192,7 +8375,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b32 s14, s8 ; GFX1164-NEXT: s_add_u32 s8, s2, 44 @@ -9229,7 +8412,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-NEXT: s_addc_u32 s9, s3, 0 @@ -9264,7 +8447,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -9310,7 +8493,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -9356,7 +8539,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: 
global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -9402,7 +8585,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 @@ -9439,7 +8622,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 @@ -9481,1732 +8664,962 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: 
v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0xc3300000 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 -; GFX7LESS-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[4:5], 
4.0, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_waitcnt expcnt(2) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, 
off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX9-NEXT: v_mov_b32_e32 v4, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 
+; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 
-; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] -; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1064-NEXT: 
v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, 
v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 ; 
GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: 
v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; 
GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20 -; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16 -; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-NEXT: scratch_store_b32 off, v1, off +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; 
GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-NEXT: .LBB16_3: -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 -; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; 
GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 -; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 -; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-NEXT: scratch_store_b32 off, v1, off +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: 
v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-NEXT: .LBB16_3: -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: 
s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 
v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; 
GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; 
GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: 
s_cbranch_execnz .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; 
GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; 
GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 -; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 -; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; 
GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], 
vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 -; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 -; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 
-; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: 
s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 + %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 monotonic, align 8 ret void } define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; 
GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: 
v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_waitcnt expcnt(2) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: 
s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], 
s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB17_1 ; 
GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 -; 
GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-NEXT: v_mov_b32_e32 v40, v0 -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 
s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: 
s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, 
s[42:43] -; GFX1032-NEXT: v_mov_b32_e32 v40, v0 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v42, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, 
s6 -; GFX1164-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v42 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; 
GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 
s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; 
GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; 
GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 ; 
GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: 
v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: 
s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; 
GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; 
GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 -; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; 
GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; 
GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt 
vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, 
s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], 
s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp - %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue monotonic, align 4 + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue monotonic, align 8 ret void } define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; 
GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB18_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -11234,23 +9647,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB18_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -11266,23 +9679,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-LABEL: 
global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB18_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -11298,22 +9711,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB18_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -11321,33 +9734,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1032-NEXT: .LBB18_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; 
GFX1164-NEXT: s_cbranch_execz .LBB18_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -11365,23 +9778,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB18_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 
s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -11390,32 +9803,32 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1132-NEXT: .LBB18_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: 
v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -11431,23 +9844,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 
s4 ; GFX1064-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -11463,22 +9876,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -11486,33 +9899,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1032-DPP-NEXT: .LBB18_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -11530,23 +9943,23 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -11555,9 +9968,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1132-DPP-NEXT: .LBB18_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -11568,15 +9981,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB19_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -11604,23 +10017,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; 
GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB19_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -11636,23 +10049,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB19_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -11668,22 +10081,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB19_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -11691,33 +10104,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1032-NEXT: .LBB19_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB19_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -11735,23 +10148,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1132-LABEL: 
global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB19_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -11760,32 +10173,32 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1132-NEXT: .LBB19_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: 
global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -11801,23 +10214,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -11833,22 +10246,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -11856,33 +10269,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1032-DPP-NEXT: .LBB19_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 
0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -11900,23 +10313,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; 
GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -11925,9 +10338,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1132-DPP-NEXT: .LBB19_3: ; GFX1132-DPP-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index a13e704a1a5fc8..005cd3a0021b39 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -21,10 +21,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,10 +54,10 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -83,10 +83,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -98,10 +98,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -112,13 +112,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -131,12 +131,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -150,10 +150,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -179,10 +179,10 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -194,10 +194,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -208,13 +208,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -227,12 +227,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -976,10 +976,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -1009,10 +1009,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1038,10 +1038,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1065,17 +1065,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1084,8 +1084,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm @@ -1093,13 +1093,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1125,18 +1125,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1146,9 +1146,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm @@ -1158,10 +1158,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1187,10 +1187,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1214,17 +1214,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1233,8 +1233,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -1242,13 +1242,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; 
GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1274,18 +1274,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1295,9 +1295,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt 
vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -2158,10 +2158,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -2191,10 +2191,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2220,10 +2220,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], 
vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2247,17 +2247,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2266,8 +2266,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm @@ -2275,13 +2275,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; 
GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2307,18 +2307,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2328,9 +2328,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: 
v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm @@ -2340,10 +2340,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2369,10 +2369,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2396,17 +2396,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: 
s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2415,8 +2415,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -2424,13 +2424,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 
s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2456,18 +2456,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2477,9 +2477,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ 
-3335,1653 +3335,739 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; 
GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: 
buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB6_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s0 -; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: 
buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 -; GFX1064-NEXT: .LBB6_3: +; GFX1064-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: .LBB6_2: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB6_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: 
s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 -; GFX1032-NEXT: .LBB6_3: +; GFX1032-NEXT: 
global_atomic_fmax_x2 v2, v[0:1], s[0:1] +; GFX1032-NEXT: .LBB6_2: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; 
GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-NEXT: .LBB6_3: -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; 
GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-NEXT: .LBB6_3: -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: 
s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; 
GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; 
GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 -; GFX1064-DPP-NEXT: .LBB6_3: +; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] +; GFX1064-DPP-NEXT: .LBB6_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: 
; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 -; GFX1032-DPP-NEXT: .LBB6_3: +; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1] +; GFX1032-DPP-NEXT: .LBB6_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: 
global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; 
GFX1164-DPP-NEXT: .LBB6_3: -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: -; 
GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 + %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8 ret void } define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; 
GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; 
GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], 
s[38:39] -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: 
buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; 
GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; 
GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35] ; 
GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; 
GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; 
GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35] ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], 
s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-NEXT: 
scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], 
s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v40 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] 
-; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; 
GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; 
GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: 
v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s36, 
SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, 
s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35] ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], 
s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 
s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35] ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, 
div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; 
GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: 
v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; 
GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() - %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 ret void } @@ -4991,10 +4077,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -5028,10 +4114,10 @@ define 
amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5059,10 +4145,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -5088,18 +4174,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB8_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5109,8 +4195,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1032-NEXT: .LBB8_3: ; GFX1032-NEXT: s_endpgm @@ -5118,13 +4204,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -5152,18 +4238,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; 
GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5173,9 +4259,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm @@ -5185,10 +4271,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; 
GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5216,10 +4302,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -5245,18 +4331,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f64 v[0:1], 
v[2:3], v[2:3] @@ -5266,8 +4352,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -5275,13 +4361,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -5309,18 +4395,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; 
GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5330,9 +4416,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -5847,1653 +4933,859 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 
s[38:39], s[0:1] -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; 
GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: s_andn2_b64 exec, 
exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: global_atomic_cmpswap_x2 
v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: 
global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1064-NEXT: 
global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; 
GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 
v[0:1], v[0:1], 4.0 +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: 
s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-NEXT: .LBB10_3: -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: 
global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-NEXT: .LBB10_3: -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; 
GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: 
v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: 
v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 
exec, exec, s[44:45] +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; 
GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 + %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 monotonic, align 8 ret void } define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; 
GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; 
GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-NEXT: 
buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; 
GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, 
SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: 
v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; 
GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; 
GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; 
GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; 
GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; 
GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: 
s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v40 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-NEXT: 
v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; 
GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: 
s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] 
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 
s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s8, 
s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 
vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; 
GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] 
-; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, 
div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; 
GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() - %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 4 + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 8 ret void } @@ -7503,10 +5795,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; 
GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -7536,10 +5828,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -7565,10 +5857,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -7592,17 +5884,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; 
%bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -7611,8 +5903,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1032-NEXT: .LBB12_3: ; GFX1032-NEXT: s_endpgm @@ -7620,13 +5912,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-NEXT: ; 
%bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -7652,18 +5944,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7673,9 +5965,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1132-NEXT: .LBB12_3: ; GFX1132-NEXT: s_endpgm @@ -7685,10 +5977,10 @@ define 
amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7714,10 +6006,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -7741,17 +6033,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -7760,8 +6052,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -7769,13 +6061,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -7801,18 +6093,18 
@@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7822,9 +6114,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -7838,10 +6130,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 
v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -7871,10 +6163,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -7900,10 +6192,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -7927,17 +6219,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; 
GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -7946,8 +6238,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm @@ -7955,13 +6247,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB13_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 
s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -7987,18 +6279,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB13_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8008,9 +6300,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm @@ -8020,10 +6312,10 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8049,10 +6341,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -8076,17 +6368,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -8095,8 +6387,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -8104,13 +6396,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -8136,18 +6428,18 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8157,9 +6449,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 65d0b9eafdf820..3f4779f08e42fe 100644 --- 
a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -21,10 +21,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,10 +54,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -83,10 +83,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -98,10 +98,10 @@ define 
amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -112,13 +112,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -131,12 +131,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; 
GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -150,10 +150,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -179,10 +179,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -194,10 +194,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 
4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -208,13 +208,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -227,12 +227,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -976,10 +976,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; 
GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -1009,10 +1009,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1038,10 +1038,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1065,17 +1065,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: 
s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1084,8 +1084,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm @@ -1093,13 +1093,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1125,18 +1125,18 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1146,9 +1146,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm @@ -1158,10 +1158,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 
; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1187,10 +1187,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1214,17 +1214,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1233,8 +1233,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -1242,13 +1242,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1274,18 +1274,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: 
s_cbranch_execz .LBB2_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1295,9 +1295,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -2158,10 +2158,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -2191,10 +2191,10 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2220,10 +2220,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2247,17 +2247,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2266,8 +2266,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm @@ -2275,13 +2275,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2307,18 +2307,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 
; GFX1132-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2328,9 +2328,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm @@ -2340,10 +2340,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2369,10 +2369,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2396,17 +2396,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2415,8 +2415,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: 
s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -2424,13 +2424,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2456,18 +2456,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2477,9 +2477,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -3335,1653 +3335,739 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-NEXT: ; 
%bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: 
v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 
v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, 
s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; 
GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB6_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s0 -; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], 
s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 -; GFX1064-NEXT: .LBB6_3: +; GFX1064-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: .LBB6_2: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB6_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 
offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 -; GFX1032-NEXT: .LBB6_3: +; GFX1032-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] +; GFX1032-NEXT: .LBB6_2: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], 
off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-NEXT: .LBB6_3: -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; 
GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], 
v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-NEXT: .LBB6_3: -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: 
buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 
s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 +; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 -; GFX1064-DPP-NEXT: .LBB6_3: +; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] +; GFX1064-DPP-NEXT: .LBB6_2: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; 
GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v1, 0x40100000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 -; GFX1032-DPP-NEXT: .LBB6_3: +; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1] +; GFX1032-DPP-NEXT: .LBB6_2: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 
v2, s1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: 
s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; 
GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 
s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 + %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8 ret void } define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: s_addc_u32 
s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 
s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 -; GFX7LESS-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 
0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] 
-; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064: ; 
%bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; 
GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; 
GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35] ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 
s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: 
v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35] ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, 
div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; 
GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm 
; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-NEXT: v_max_f64 
v[4:5], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v40 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, 
v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; 
GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 
-; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 
-; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] 
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35] ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s36, 
SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; 
GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35] ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] 
-; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, 
s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; 
GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() - %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 ret void 
} @@ -4991,10 +4077,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -5028,10 +4114,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5059,10 +4145,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -5088,18 +4174,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; 
GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB8_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5109,8 +4195,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1032-NEXT: .LBB8_3: ; GFX1032-NEXT: s_endpgm @@ -5118,13 +4204,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -5152,18 +4238,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5173,9 +4259,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm @@ -5185,10 +4271,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5216,10 +4302,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -5245,18 +4331,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 ; 
GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5266,8 +4352,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -5275,13 +4361,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -5309,18 +4395,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5330,9 +4416,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: 
s_endpgm @@ -5847,1653 +4933,859 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 -; GFX7LESS-NEXT: v_mov_b32_e32 
v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; 
GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 
v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, 
v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; 
GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s1 -; 
GFX1164-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-NEXT: .LBB10_3: -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 
s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-NEXT: .LBB10_3: -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DPP-NEXT: v_or3_b32 
v40, v0, v4, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; 
GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword 
v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], 
s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; 
GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: 
s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; 
GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; 
GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 + %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 monotonic, align 8 ret void } define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 
-; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; 
GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; 
GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: 
s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-NEXT: v_max_f64 
v[41:42], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 
v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; 
GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 
0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: 
s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: 
s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: 
buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 
s14, s33 -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], 
s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v40 -; GFX1132-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; 
GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 +; 
GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; 
GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; 
GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; 
GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: 
s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; 
GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; 
GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: 
v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], 
s[36:37] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], 
v[0:1], v[41:42] -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: 
v_mov_b32_e32 v31, v0 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: 
s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; 
GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() - %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 4 + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 8 ret void } @@ -7503,10 +5795,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -7536,10 +5828,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -7565,10 +5857,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; 
GFX1064-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -7592,17 +5884,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -7611,8 +5903,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1032-NEXT: .LBB12_3: ; GFX1032-NEXT: s_endpgm @@ -7620,13 +5912,13 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -7652,18 +5944,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) @@ -7673,9 +5965,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1132-NEXT: .LBB12_3: ; GFX1132-NEXT: s_endpgm @@ -7685,10 +5977,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7714,10 +6006,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -7741,17 +6033,17 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -7760,8 +6052,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -7769,13 +6061,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; 
GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -7801,18 +6093,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7822,9 +6114,9 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -7838,10 +6130,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -7871,10 +6163,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -7900,10 +6192,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, 
v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -7927,17 +6219,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -7946,8 +6238,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: 
.LBB13_3: ; GFX1032-NEXT: s_endpgm @@ -7955,13 +6247,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB13_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -7987,18 +6279,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB13_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) @@ -8008,9 +6300,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm @@ -8020,10 +6312,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8049,10 +6341,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -8076,17 +6368,17 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -8095,8 +6387,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -8104,13 +6396,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; 
GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -8136,18 +6428,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8157,9 +6449,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 2bba8d4f43b1a8..64650e2733a00d 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -18,15 +18,15 @@ declare double @div.double.value() define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,23 +54,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: 
-; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: 
s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -118,22 +118,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -141,33 +141,33 
@@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -185,23 +185,23 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -210,32 +210,32 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: 
global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -251,23 +251,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -283,22 +283,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 
v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -306,33 +306,33 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -350,23 +350,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; 
GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -375,9 +375,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1132-DPP-NEXT: .LBB0_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -1158,24 +1158,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ret void } -define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -1205,27 +1205,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: 
v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1247,33 +1247,33 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1287,31 +1287,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: .LBB2_3: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: 
s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1320,19 +1320,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -1344,7 +1344,7 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1369,14 +1369,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: .LBB2_3: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -1386,14 +1386,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1402,34 +1402,34 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; 
GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1451,33 +1451,33 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: .LBB2_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: 
s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1491,31 +1491,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: .LBB2_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; 
GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1524,19 +1524,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: 
global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -1548,7 +1548,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1573,14 +1573,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: .LBB2_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: 
s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -1590,14 +1590,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1606,9 +1606,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -1617,8 +1617,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope } -define amdgpu_kernel void 
@global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -1668,7 +1668,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1734,7 +1734,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1800,7 +1800,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_5: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -1865,7 +1865,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: 
.LBB3_5: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 @@ -1926,7 +1926,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: .LBB3_5: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 @@ -1986,7 +1986,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -2072,7 +2072,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -2154,7 +2154,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: .LBB3_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP-LABEL: 
global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -2230,7 +2230,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: .LBB3_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 @@ -2312,7 +2312,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: .LBB3_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 @@ -2393,21 +2393,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 
+; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -2439,25 +2439,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 
s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2481,31 +2481,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2521,29 +2521,29 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 -; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2552,19 +2552,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -2576,7 +2576,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2603,12 +2603,12 @@ 
define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -2618,14 +2618,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2634,34 +2634,34 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; 
GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2685,31 +2685,31 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 
s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2725,29 +2725,29 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2756,19 +2756,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -2780,7 +2780,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2807,12 +2807,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -2822,14 +2822,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2838,9 +2838,9 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -3623,8 +3623,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ } -define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -3674,7 +3674,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3740,7 +3740,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB6_5: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1064: ; %bb.0: ; 
GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3806,7 +3806,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_5: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -3871,7 +3871,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: .LBB6_5: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 @@ -3932,7 +3932,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: .LBB6_5: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 @@ -3992,7 +3992,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB6_5: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -4078,7 +4078,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: .LBB6_3: 
; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -4160,7 +4160,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -4236,7 +4236,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 @@ -4318,7 +4318,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 @@ -4399,21 +4399,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: 
global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -4445,25 +4445,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4487,31 +4487,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -4527,29 +4527,29 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4558,19 +4558,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: 
s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -4582,7 +4582,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4609,12 +4609,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -4624,14 +4624,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4640,34 +4640,34 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; 
%bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4691,31 +4691,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: 
s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -4731,29 +4731,29 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; 
GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4762,19 +4762,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: 
v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -4786,7 +4786,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4813,12 +4813,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -4828,14 +4828,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4844,9 +4844,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -5630,1708 +5630,891 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; 
GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 -; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v4 -; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX7LESS-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v4, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: 
buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_waitcnt expcnt(2) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; 
GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 -; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[0:1], v[3:4], -v[41:42] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: 
s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1064-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; 
GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; 
GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1032-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; 
GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; 
GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], 
off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-NEXT: .LBB9_3: -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s6, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v40, v0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s6 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; 
GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; 
GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-NEXT: .LBB9_3: -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 -; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[3:4], -v[41:42] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: 
buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b64 s[8:9], 
exec -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; 
GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: 
buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: 
v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; 
GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s6, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; 
GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s6 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: 
s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 + %result = atomicrmw fsub ptr 
addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8 ret void } define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, 
div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: 
s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_waitcnt expcnt(2) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9: ; %bb.0: 
-; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: 
s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; 
GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 
s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-NEXT: v_mov_b32_e32 v40, v0 -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 
s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: 
global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; 
GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-NEXT: v_mov_b32_e32 v40, v0 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 
s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] 
+; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v42, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v42 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; 
GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, 
s1, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: 
s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 
s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 
s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], 
s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], 
v43, s[42:43] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, 
off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: 
s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1032-DPP-NEXT: .LBB10_1: ; 
%atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; 
GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v42, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 
s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], 
s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 
0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() - %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8 ret void } -define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -7364,27 +6547,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: .LBB11_3: ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; 
GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -7407,25 +6590,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-NEXT: .LBB11_3: ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: 
s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -7448,32 +6631,32 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-NEXT: .LBB11_3: ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: 
s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -7482,19 +6665,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-NEXT: .LBB11_3: ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: 
global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -7506,7 +6689,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -7532,14 +6715,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-NEXT: .LBB11_3: ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: 
scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -7549,14 +6732,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7565,34 +6748,34 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 
0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7615,25 +6798,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 
-; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -7656,32 +6839,32 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: 
s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -7690,19 +6873,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -7714,7 +6897,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -7740,14 +6923,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: 
s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -7757,14 +6940,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7773,17 +6956,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic ret void } -define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -7836,7 +7019,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7882,7 +7065,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7928,7 +7111,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: 
global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -7974,7 +7157,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b32 s14, s8 ; GFX1164-NEXT: s_add_u32 s8, s2, 44 @@ -8011,7 +7194,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-NEXT: s_addc_u32 s9, s3, 0 @@ -8046,7 +7229,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8092,7 +7275,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP-LABEL: 
global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8138,7 +7321,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -8184,7 +7367,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 @@ -8221,7 +7404,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 @@ -8267,17 +7450,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 
s12, s12, s3 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -8312,25 +7495,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: 
s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8355,23 +7538,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 
v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -8396,30 +7579,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, 
s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -8428,19 +7611,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -8452,7 +7635,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -8480,12 +7663,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: 
-; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -8495,14 +7678,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8511,34 +7694,34 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; 
GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8563,23 +7746,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 
s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -8604,30 +7787,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: 
s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -8636,19 +7819,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, 
v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -8660,7 +7843,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -8688,12 +7871,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: 
v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -8703,14 +7886,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8719,9 +7902,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -9207,8 +8390,8 @@ define 
amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ret void } -define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { -; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 @@ -9261,7 +8444,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; -; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -9307,7 +8490,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -9353,7 +8536,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: 
s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -9399,7 +8582,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b32 s14, s8 ; GFX1164-NEXT: s_add_u32 s8, s2, 44 @@ -9436,7 +8619,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-NEXT: s_addc_u32 s9, s3, 0 @@ -9471,7 +8654,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; -; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -9517,7 +8700,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; -; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ 
-9563,7 +8746,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; -; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 @@ -9609,7 +8792,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; -; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 @@ -9646,7 +8829,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; -; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_strictfp: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 @@ -9687,1717 +8870,947 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, 
-1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0xc3300000 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 -; GFX7LESS-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_waitcnt expcnt(2) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; 
GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX9-NEXT: v_mov_b32_e32 v4, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: 
s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; 
GFX1064-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] -; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 
-; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, 
s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20 -; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16 -; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, 
v0 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-NEXT: scratch_store_b32 off, v1, off +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; 
GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-NEXT: .LBB16_3: -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 -; 
GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 -; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 -; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-NEXT: scratch_store_b32 off, v1, off +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: 
v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-NEXT: .LBB16_3: -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: 
s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: 
s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; 
GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: 
v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: 
s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 
0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, 
off offset:20 -; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 -; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], 
s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 -; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 -; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1132-DPP-NEXT: scratch_load_b64 
v[0:1], off, off +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; 
GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 + %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 monotonic, align 8 ret void } define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: 
s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, 
s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 -; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[0:1] -; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7LESS-NEXT: s_waitcnt expcnt(2) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 
s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: s_getpc_b64 
s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-NEXT: v_mov_b32_e32 v40, v0 -; 
GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: 
v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-NEXT: v_mov_b32_e32 v40, v0 -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1064-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[0:1] -; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: 
v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 
-; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; 
GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-NEXT: v_mov_b32_e32 v40, v0 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1032-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[0:1] -; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: 
buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b64 
s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v42, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1164-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[0:1] -; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v42 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 
v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 
s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 -; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1132-NEXT: s_mov_b32 s44, 0 -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1132-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[0:1] -; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 -; 
GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, 
div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-DPP-NEXT: 
global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; 
GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: 
s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; 
GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], 
s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: 
buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; 
GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b64 
s[44:45], 0 -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; 
GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; 
GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 ; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off -; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp - %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue monotonic, align 4 + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue monotonic, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll index e41634402c0c2b..3b71e8ffefbf8c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll @@ -71,7 +71,7 @@ bb: ; uniform load dominated by no-alias store - scalarize ; CHECK-LABEL: @no_memdep_alias_arg -; CHECK: s_load_dwordx2 s[[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]], s[6:7], 0x0 +; CHECK: s_load_dwordx2 s[[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]], s[4:5], 0x0 ; CHECK: s_load_dword [[SVAL:s[0-9]+]], 
s[[[IN_LO]]:[[IN_HI]]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] @@ -100,7 +100,7 @@ define amdgpu_kernel void @memdep(ptr addrspace(1) %in, [8 x i32], ptr addrspace ; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]] ; CHECK-DAG: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0 ; CHECK-DAG: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0 -; CHECK-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[6:7], 0x0 +; CHECK-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0 ; CHECK-DAG: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 3735c6349fbb31..e2d55990473c09 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: load_f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -19,8 +19,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; ; VI-LABEL: load_f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -31,10 +31,10 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; GFX11-LABEL: load_f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -46,8 +46,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: load_v2f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -57,8 +57,8 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; ; VI-LABEL: load_v2f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -69,10 +69,10 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; GFX11-LABEL: load_v2f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -84,7 +84,7 @@ define amdgpu_kernel void 
@load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CIVI-LABEL: load_v3f16_arg: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_add_u32 s4, s0, 4 ; CIVI-NEXT: s_addc_u32 s5, s1, 0 @@ -100,7 +100,7 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; ; GFX11-LABEL: load_v3f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 @@ -119,7 +119,7 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CIVI-LABEL: load_v4f16_arg: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v2, s2 @@ -130,7 +130,7 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; ; GFX11-LABEL: load_v4f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -145,12 +145,12 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: load_v8f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v4, s4 +; CI-NEXT: v_mov_b32_e32 v4, s6 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_mov_b32_e32 v5, s7 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 @@ -159,12 +159,12 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; ; VI-LABEL: load_v8f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -174,8 +174,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; GFX11-LABEL: load_v8f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -191,8 +191,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: extload_v2f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -204,8 +204,8 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr 
addrspace(1) %out, <2 x half> % ; ; VI-LABEL: extload_v2f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -218,13 +218,13 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; GFX11-LABEL: extload_v2f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -237,8 +237,8 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: extload_f16_to_f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -248,8 +248,8 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; ; VI-LABEL: extload_f16_to_f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -260,11 +260,11 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; GFX11-LABEL: extload_f16_to_f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -277,8 +277,8 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: extload_v2f16_to_v2f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -290,8 +290,8 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; ; VI-LABEL: extload_v2f16_to_v2f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -304,13 +304,13 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; GFX11-LABEL: extload_v2f16_to_v2f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, 
s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -323,7 +323,7 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 @@ -336,7 +336,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; ; VI-LABEL: extload_v3f16_to_v3f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -349,7 +349,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 @@ -368,7 +368,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: s_lshr_b32 s5, s2, 16 @@ -383,7 +383,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; ; VI-LABEL: extload_v4f16_to_v4f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -398,7 +398,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s3, 16 @@ -419,8 +419,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: extload_v8f16_to_v8f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s1, 16 ; CI-NEXT: s_lshr_b32 s7, s0, 16 @@ -447,8 +447,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; ; VI-LABEL: extload_v8f16_to_v8f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s1, 16 ; VI-NEXT: s_lshr_b32 s7, s0, 16 @@ -476,8 +476,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; GFX11-LABEL: extload_v8f16_to_v8f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: 
s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s8, s7, 16 @@ -506,10 +506,10 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: extload_f16_to_f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -519,10 +519,10 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; ; VI-LABEL: extload_f16_to_f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x8 +; VI-NEXT: s_load_dword s0, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -532,14 +532,14 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; ; GFX11-LABEL: extload_f16_to_f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -552,12 +552,12 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: extload_v2f16_to_v2f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -568,12 +568,12 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; ; VI-LABEL: extload_v2f16_to_v2f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x8 +; VI-NEXT: s_load_dword s0, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -584,17 +584,17 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; ; GFX11-LABEL: extload_v2f16_to_v2f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s1, s0, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s3, s2, 
16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -607,7 +607,7 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: s_lshr_b32 s4, s2, 16 @@ -628,7 +628,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; ; VI-LABEL: extload_v3f16_to_v3f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 @@ -649,7 +649,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -675,7 +675,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; 
CI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; ; VI-LABEL: extload_v4f16_to_v4f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -725,7 +725,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s5, s3, 16 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 @@ -754,8 +754,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: extload_v8f16_to_v8f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 @@ -801,8 +801,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; ; VI-LABEL: extload_v8f16_to_v8f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 @@ -848,20 +848,22 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; ; GFX11-LABEL: extload_v8f16_to_v8f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s9, s7, 16 ; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s1, s5, 16 +; GFX11-NEXT: s_lshr_b32 s3, s5, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s7 ; GFX11-NEXT: v_cvt_f32_f16_e32 v11, s9 -; GFX11-NEXT: s_lshr_b32 s0, s4, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s6 ; GFX11-NEXT: v_cvt_f32_f16_e32 v10, s8 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s5 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s2 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v6 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v11 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 @@ -870,9 +872,7 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 ; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 @@ -889,7 +889,7 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -902,7 +902,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: global_load_store_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -919,7 +919,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_v2f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -932,7 +932,7 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: global_load_store_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -949,7 +949,7 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; CIVI-LABEL: global_load_store_v4f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -962,7 +962,7 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add ; ; GFX11-LABEL: global_load_store_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -979,7 +979,7 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add define amdgpu_kernel void 
@global_load_store_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_v8f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -992,7 +992,7 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: global_load_store_v8f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_extload_f16_to_f32: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1023,7 +1023,7 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_extload_f16_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1042,7 +1042,7 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v2f16_to_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 
@@ -1058,7 +1058,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v2f16_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v2f16_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v2, s[2:3] @@ -1095,7 +1095,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v3f16_to_v3f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1112,7 +1112,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v3f16_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1128,7 +1128,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v3f16_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v3, s[2:3] @@ -1151,7 +1151,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr 
addrspace(1) %out, define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v4f16_to_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v4f16_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1187,7 +1187,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v4f16_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] @@ -1212,7 +1212,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v8f16_to_v8f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1242,7 +1242,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v8f16_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1268,7 +1268,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr 
addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v8f16_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] @@ -1300,7 +1300,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v16f16_to_v16f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -1358,7 +1358,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; ; VI-LABEL: global_extload_v16f16_to_v16f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1408,7 +1408,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; ; GFX11-LABEL: global_extload_v16f16_to_v16f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v20, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_extload_f16_to_f64: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1472,7 +1472,7 @@ define amdgpu_kernel void 
@global_extload_f16_to_f64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_extload_f16_to_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1493,7 +1493,7 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v2f16_to_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v2f16_to_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1528,7 +1528,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v2f16_to_v2f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v4, s[2:3] @@ -1553,7 +1553,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v3f16_to_v3f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1578,7 +1578,7 @@ define 
amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v3f16_to_v3f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1602,7 +1602,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v3f16_to_v3f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] @@ -1631,7 +1631,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v4f16_to_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1659,7 +1659,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v4f16_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v4f16_to_v4f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] @@ -1718,7 +1718,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, 
define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v8f16_to_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1766,7 +1766,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v8f16_to_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1810,7 +1810,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v8f16_to_v8f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3] @@ -1852,7 +1852,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v16f16_to_v16f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1947,7 +1947,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; ; VI-LABEL: global_extload_v16f16_to_v16f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2033,7 +2033,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr 
addrspace(1) %out ; ; GFX11-LABEL: global_extload_v16f16_to_v16f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2102,7 +2102,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_truncstore_f32_to_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2116,7 +2116,7 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p ; ; GFX11-LABEL: global_truncstore_f32_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2135,7 +2135,7 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v2f32_to_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2152,7 +2152,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v2f32_to_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2168,7 +2168,7 @@ define amdgpu_kernel void 
@global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v2f32_to_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -2190,7 +2190,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v3f32_to_v3f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2213,7 +2213,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v3f32_to_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2235,7 +2235,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v3f32_to_v3f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b96 v[0:2], v3, s[2:3] @@ -2260,7 +2260,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v4f32_to_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 
v1, s3 @@ -2281,7 +2281,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v4f32_to_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2300,7 +2300,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v4f32_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -2325,7 +2325,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v8f32_to_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2360,7 +2360,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v8f32_to_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2391,7 +2391,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v8f32_to_v8f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2425,7 +2425,7 @@ define amdgpu_kernel void 
@global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v16f32_to_v16f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 32 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -2494,7 +2494,7 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; ; VI-LABEL: global_truncstore_v16f32_to_v16f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -2554,7 +2554,7 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; ; GFX11-LABEL: global_truncstore_v16f32_to_v16f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 @@ -2606,12 +2606,12 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 { ; CI-LABEL: fadd_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2622,8 +2622,8 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; ; VI-LABEL: fadd_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2636,13 +2636,13 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; GFX11-LABEL: fadd_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_add_f16_e64 v1, s4, s2 +; GFX11-NEXT: v_add_f16_e64 v1, s2, s3 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2655,7 +2655,7 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x half> %b) #0 { ; CI-LABEL: fadd_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2676,7 +2676,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -2693,7 +2693,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; ; GFX11-LABEL: fadd_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, s2, s3 @@ -2709,7 +2709,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: fadd_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2746,7 +2746,7 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: fadd_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2765,7 +2765,7 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX11-LABEL: fadd_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -2787,8 +2787,8 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 { ; CI-LABEL: fadd_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s2, s8, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2845,8 +2845,8 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; ; VI-LABEL: fadd_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], 
s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s15, 16 ; VI-NEXT: s_lshr_b32 s3, s11, 16 @@ -2888,8 +2888,8 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; GFX11-LABEL: fadd_v8f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, s7, s11 @@ -2908,7 +2908,7 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; CIVI-LABEL: test_bitcast_from_half: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -2921,7 +2921,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr ; ; GFX11-LABEL: test_bitcast_from_half: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2939,7 +2939,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: test_bitcast_to_half: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2952,7 +2952,7 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr 
addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_bitcast_to_half: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll index 380a8e911e4995..b6eff8846dc8c7 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll @@ -98,4 +98,4 @@ bb: ret void } -attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-heap-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-heap-v5.ll index 10c5ffd0eb07e6..6a49eac134a67b 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-heap-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-heap-v5.ll @@ -1,6 +1,5 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor -o %t.bc %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.bc | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll index 677584caa8b2e6..6f4c8911efd33b 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll @@ -1,6 +1,5 @@ -; RUN: 
opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor -o %t.bc %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.bc | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v5.ll index 1a5a7698e2f96d..01f8fbfd76314a 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v5.ll @@ -1,6 +1,5 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor -o %t.bc %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.bc | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index 8c017fa5ec2636..7a9f4ae8a20fae 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -15,7 +15,7 @@ ; CHECK: .max_flat_workgroup_size: 1024 ; CHECK: .name: test ; CHECK: .private_segment_fixed_size: 0 -; CHECK: .sgpr_count: 10 +; CHECK: .sgpr_count: 6 ; CHECK: .symbol: test.kd ; CHECK: .vgpr_count: {{3|6}} ; WAVE64: .wavefront_size: 64 @@ -23,7 +23,7 @@ define amdgpu_kernel void @test( ptr addrspace(1) %r, ptr 
addrspace(1) %a, - ptr addrspace(1) %b) "amdgpu-no-implicitarg-ptr" { + ptr addrspace(1) %b) { entry: %a.val = load half, ptr addrspace(1) %a %b.val = load half, ptr addrspace(1) %b @@ -47,10 +47,10 @@ entry: } ; CHECK: .name: num_spilled_sgprs -; GFX700: .sgpr_spill_count: 10 -; GFX803: .sgpr_spill_count: 10 -; GFX900: .sgpr_spill_count: 62 -; GFX1010: .sgpr_spill_count: 60 +; GFX700: .sgpr_spill_count: 12 +; GFX803: .sgpr_spill_count: 12 +; GFX900: .sgpr_spill_count: 48 +; GFX1010: .sgpr_spill_count: 48 ; CHECK: .symbol: num_spilled_sgprs.kd define amdgpu_kernel void @num_spilled_sgprs( ptr addrspace(1) %out0, ptr addrspace(1) %out1, [8 x i32], @@ -61,37 +61,27 @@ define amdgpu_kernel void @num_spilled_sgprs( ptr addrspace(1) %outa, ptr addrspace(1) %outb, [8 x i32], ptr addrspace(1) %outc, ptr addrspace(1) %outd, [8 x i32], ptr addrspace(1) %oute, ptr addrspace(1) %outf, [8 x i32], - ptr addrspace(1) %outg, ptr addrspace(1) %outh, [8 x i32], - ptr addrspace(1) %outi, ptr addrspace(1) %outj, [8 x i32], - ptr addrspace(1) %outk, ptr addrspace(1) %outl, [8 x i32], - ptr addrspace(1) %outm, ptr addrspace(1) %outn, [8 x i32], i32 %in0, i32 %in1, i32 %in2, i32 %in3, [8 x i32], i32 %in4, i32 %in5, i32 %in6, i32 %in7, [8 x i32], i32 %in8, i32 %in9, i32 %ina, i32 %inb, [8 x i32], - i32 %inc, i32 %ind, i32 %ine, i32 %inf, i32 %ing, i32 %inh, - i32 %ini, i32 %inj, i32 %ink) #0 { + i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 { entry: - store volatile i32 %in0, ptr addrspace(1) %out0 - store volatile i32 %in1, ptr addrspace(1) %out1 - store volatile i32 %in2, ptr addrspace(1) %out2 - store volatile i32 %in3, ptr addrspace(1) %out3 - store volatile i32 %in4, ptr addrspace(1) %out4 - store volatile i32 %in5, ptr addrspace(1) %out5 - store volatile i32 %in6, ptr addrspace(1) %out6 - store volatile i32 %in7, ptr addrspace(1) %out7 - store volatile i32 %in8, ptr addrspace(1) %out8 - store volatile i32 %in9, ptr addrspace(1) %out9 - store volatile i32 %ina, ptr addrspace(1) %outa - 
store volatile i32 %inb, ptr addrspace(1) %outb - store volatile i32 %inc, ptr addrspace(1) %outc - store volatile i32 %ind, ptr addrspace(1) %outd - store volatile i32 %ine, ptr addrspace(1) %oute - store volatile i32 %inf, ptr addrspace(1) %outf - store volatile i32 %ing, ptr addrspace(1) %outg - store volatile i32 %inh, ptr addrspace(1) %outh - store volatile i32 %ini, ptr addrspace(1) %outi - store volatile i32 %inj, ptr addrspace(1) %outj - store volatile i32 %ink, ptr addrspace(1) %outk + store i32 %in0, ptr addrspace(1) %out0 + store i32 %in1, ptr addrspace(1) %out1 + store i32 %in2, ptr addrspace(1) %out2 + store i32 %in3, ptr addrspace(1) %out3 + store i32 %in4, ptr addrspace(1) %out4 + store i32 %in5, ptr addrspace(1) %out5 + store i32 %in6, ptr addrspace(1) %out6 + store i32 %in7, ptr addrspace(1) %out7 + store i32 %in8, ptr addrspace(1) %out8 + store i32 %in9, ptr addrspace(1) %out9 + store i32 %ina, ptr addrspace(1) %outa + store i32 %inb, ptr addrspace(1) %outb + store i32 %inc, ptr addrspace(1) %outc + store i32 %ind, ptr addrspace(1) %outd + store i32 %ine, ptr addrspace(1) %oute + store i32 %inf, ptr addrspace(1) %outf ret void } @@ -170,7 +160,7 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 { ; CHECK-NEXT: - 1 ; CHECK-NEXT: - 1 -attributes #0 = { "amdgpu-num-sgpr"="20" } +attributes #0 = { "amdgpu-num-sgpr"="14" } attributes #1 = { "amdgpu-num-vgpr"="20" } attributes #2 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll index 0db5f01fc0ccc9..689619227b8d70 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll @@ -1,6 +1,5 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor %s -o %t.bc -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.bc | llvm-readelf --notes - | 
FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.bc | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll index 6eece2c9bf4166..9854977c2f308b 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll @@ -1,12 +1,10 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -passes=amdgpu-attributor -o %t.gfx7.bc %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=amdgpu-attributor -o %t.gfx8.bc %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor -o %t.gfx9.bc %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj < %t.gfx7.bc | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj < %t.gfx8.bc | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.gfx9.bc | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %t.gfx7.bc | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %t.gfx8.bc | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.gfx9.bc | FileCheck --check-prefixes=CHECK,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 
-filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX9 %s + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=CHECK,GFX9 %s ; On gfx8, the queue ptr is required for this addrspacecast. diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll index acf829c4d3c720..cf26a427aec324 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll @@ -1,6 +1,5 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=amdgpu-attributor -o %t.bc %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %t.bc | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll index 03242b69beb8c3..7986368e2a3584 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll @@ -31,14 +31,14 @@ ; GFX10: .sgpr_spill_count: 0 ; GFX10: .vgpr_count: 4 ; GFX10: .vgpr_spill_count: 0 -define amdgpu_kernel void @test1(ptr %x) #1 { +define amdgpu_kernel void @test1(ptr %x) { %1 = load volatile float, ptr %x %2 = call float @f(float %1) store volatile float %2, ptr %x ret void } -define internal float @f(float %arg0) #1 { +define 
internal float @f(float %arg0) #0 { %stack = alloca float, i32 4, align 4, addrspace(5) store volatile float 3.0, ptr addrspace(5) %stack %val = load volatile float, ptr addrspace(5) %stack @@ -135,7 +135,6 @@ define amdgpu_kernel void @test4() { } attributes #0 = { norecurse } -attributes #1 = { norecurse "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll index 487e62b6c3495e..de484677bf5e6b 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -106,7 +106,7 @@ ; HSA: .Lfunc_end0: ; HSA: .size simple, .Lfunc_end0-simple -define amdgpu_kernel void @simple(ptr addrspace(1) %out) #0 { +define amdgpu_kernel void @simple(ptr addrspace(1) %out) { entry: store i32 0, ptr addrspace(1) %out ret void @@ -114,13 +114,11 @@ entry: ; HSA-LABEL: {{^}}simple_no_kernargs: ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -define amdgpu_kernel void @simple_no_kernargs() #0 { +define amdgpu_kernel void @simple_no_kernargs() { entry: store volatile i32 0, ptr addrspace(1) undef ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 78653d7e21ad81..7ee31bf4dce7cd 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -6,9 +6,9 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: udiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; 
GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -52,10 +52,9 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; ; GFX10-LABEL: udiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_sub_i32 s2, 0, s6 @@ -101,8 +100,8 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: udiv32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -172,9 +171,9 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: urem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -216,10 +215,9 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; ; GFX10-LABEL: urem32_invariant_denom: ; GFX10: ; 
%bb.0: ; %bb -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_sub_i32 s2, 0, s6 @@ -263,8 +261,8 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: urem32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -333,14 +331,14 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: sdiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s4, s5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_ashr_i32 s3, s5, 31 -; GFX9-NEXT: s_sub_i32 s5, 0, s4 +; GFX9-NEXT: s_abs_i32 s2, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_sub_i32 s5, 0, s2 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -351,70 +349,70 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB2_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_mul_hi_u32 s6, s2, s5 -; 
GFX9-NEXT: s_mul_i32 s7, s6, s4 -; GFX9-NEXT: s_sub_i32 s7, s2, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s3, s5 +; GFX9-NEXT: s_mul_i32 s7, s6, s2 +; GFX9-NEXT: s_sub_i32 s7, s3, s7 ; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_sub_i32 s9, s7, s4 -; GFX9-NEXT: s_cmp_ge_u32 s7, s4 +; GFX9-NEXT: s_sub_i32 s9, s7, s2 +; GFX9-NEXT: s_cmp_ge_u32 s7, s2 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6 ; GFX9-NEXT: s_cselect_b32 s7, s9, s7 ; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s4 +; GFX9-NEXT: s_cmp_ge_u32 s7, s2 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6 -; GFX9-NEXT: s_xor_b32 s6, s6, s3 -; GFX9-NEXT: s_sub_i32 s6, s6, s3 -; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: s_sub_i32 s6, s6, s4 +; GFX9-NEXT: s_add_i32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_abs_i32 s4, s5 -; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s3, 0, s4 +; GFX10-NEXT: s_abs_i32 s2, s3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_sub_i32 s4, 0, s2 +; GFX10-NEXT: s_ashr_i32 s3, s3, 31 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s6, v0 +; GFX10-NEXT: v_readfirstlane_b32 s5, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s3, s3, s6 -; GFX10-NEXT: s_mul_hi_u32 
s5, s6, s3 -; GFX10-NEXT: s_mov_b32 s3, 0 -; GFX10-NEXT: s_add_i32 s5, s6, s5 +; GFX10-NEXT: s_mul_i32 s4, s4, s5 +; GFX10-NEXT: s_mul_hi_u32 s6, s5, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s5, s5, s6 ; GFX10-NEXT: .LBB2_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_mul_hi_u32 s6, s3, s5 -; GFX10-NEXT: s_mul_i32 s7, s6, s4 +; GFX10-NEXT: s_mul_hi_u32 s6, s4, s5 +; GFX10-NEXT: s_mul_i32 s7, s6, s2 ; GFX10-NEXT: s_add_i32 s8, s6, 1 -; GFX10-NEXT: s_sub_i32 s7, s3, s7 -; GFX10-NEXT: s_sub_i32 s9, s7, s4 -; GFX10-NEXT: s_cmp_ge_u32 s7, s4 +; GFX10-NEXT: s_sub_i32 s7, s4, s7 +; GFX10-NEXT: s_sub_i32 s9, s7, s2 +; GFX10-NEXT: s_cmp_ge_u32 s7, s2 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6 ; GFX10-NEXT: s_cselect_b32 s7, s9, s7 ; GFX10-NEXT: s_add_i32 s8, s6, 1 -; GFX10-NEXT: s_cmp_ge_u32 s7, s4 +; GFX10-NEXT: s_cmp_ge_u32 s7, s2 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6 -; GFX10-NEXT: s_add_i32 s3, s3, 1 -; GFX10-NEXT: s_xor_b32 s6, s6, s2 -; GFX10-NEXT: s_sub_i32 s6, s6, s2 +; GFX10-NEXT: s_add_i32 s4, s4, 1 +; GFX10-NEXT: s_xor_b32 s6, s6, s3 +; GFX10-NEXT: s_sub_i32 s6, s6, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -422,51 +420,51 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: sdiv32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_abs_i32 s4, s5 -; GFX11-NEXT: s_ashr_i32 s2, 
s5, 31 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX11-NEXT: s_sub_i32 s3, 0, s4 +; GFX11-NEXT: s_abs_i32 s2, s3 +; GFX11-NEXT: s_ashr_i32 s3, s3, 31 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: s_sub_i32 s4, 0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s6, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mul_i32 s3, s3, s6 +; GFX11-NEXT: s_mul_i32 s4, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s5, s6, s3 -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: s_add_i32 s5, s6, s5 +; GFX11-NEXT: s_mul_hi_u32 s6, s5, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s5, s5, s6 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB2_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s6, s3, s5 -; GFX11-NEXT: s_mul_i32 s7, s6, s4 +; GFX11-NEXT: s_mul_hi_u32 s6, s4, s5 +; GFX11-NEXT: s_mul_i32 s7, s6, s2 ; GFX11-NEXT: s_add_i32 s8, s6, 1 -; GFX11-NEXT: s_sub_i32 s7, s3, s7 +; GFX11-NEXT: s_sub_i32 s7, s4, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s9, s7, s4 -; GFX11-NEXT: s_cmp_ge_u32 s7, s4 +; GFX11-NEXT: s_sub_i32 s9, s7, s2 +; GFX11-NEXT: s_cmp_ge_u32 s7, s2 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6 ; GFX11-NEXT: s_cselect_b32 s7, s9, s7 ; GFX11-NEXT: s_add_i32 s8, s6, 1 -; GFX11-NEXT: s_cmp_ge_u32 s7, s4 +; GFX11-NEXT: s_cmp_ge_u32 s7, s2 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6 -; GFX11-NEXT: s_add_i32 s3, s3, 1 -; GFX11-NEXT: s_xor_b32 s6, s6, s2 +; GFX11-NEXT: s_add_i32 s4, s4, 1 +; GFX11-NEXT: 
s_xor_b32 s6, s6, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s6, s6, s2 +; GFX11-NEXT: s_sub_i32 s6, s6, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 4 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 @@ -492,126 +490,125 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: srem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s4, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_sub_i32 s3, 0, s4 -; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s3, s3, s5 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 -; GFX9-NEXT: s_add_i32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s4, s5, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB3_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s3 -; GFX9-NEXT: s_mul_i32 s5, s5, s4 -; GFX9-NEXT: s_sub_i32 s5, s2, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s4 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4 +; GFX9-NEXT: s_mul_i32 s5, s5, s2 +; GFX9-NEXT: s_sub_i32 s5, s3, s5 +; GFX9-NEXT: s_sub_i32 s6, 
s5, s2 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s4 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_sub_i32 s6, s5, s2 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: s_add_i32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX9-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: srem32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_abs_i32 s4, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s2, 0, s4 +; GFX10-NEXT: s_abs_i32 s2, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s2, s2, s3 -; GFX10-NEXT: s_mul_hi_u32 s5, s3, s2 -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_add_i32 s3, s3, s5 +; GFX10-NEXT: s_mul_i32 s3, s3, s4 +; GFX10-NEXT: s_mul_hi_u32 s5, s4, s3 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_add_i32 s4, s4, s5 ; GFX10-NEXT: .LBB3_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_mul_hi_u32 s5, s2, s3 -; GFX10-NEXT: s_mul_i32 s5, s5, s4 -; GFX10-NEXT: s_sub_i32 s5, s2, s5 -; GFX10-NEXT: s_sub_i32 s6, s5, s4 -; GFX10-NEXT: s_cmp_ge_u32 s5, s4 +; GFX10-NEXT: s_mul_hi_u32 s5, 
s3, s4 +; GFX10-NEXT: s_mul_i32 s5, s5, s2 +; GFX10-NEXT: s_sub_i32 s5, s3, s5 +; GFX10-NEXT: s_sub_i32 s6, s5, s2 +; GFX10-NEXT: s_cmp_ge_u32 s5, s2 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_sub_i32 s6, s5, s4 -; GFX10-NEXT: s_cmp_ge_u32 s5, s4 +; GFX10-NEXT: s_sub_i32 s6, s5, s2 +; GFX10-NEXT: s_cmp_ge_u32 s5, s2 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: s_add_i32 s3, s3, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX10-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: srem32_invariant_denom: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_abs_i32 s4, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX11-NEXT: s_sub_i32 s2, 0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_abs_i32 s2, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: s_sub_i32 s3, 0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX11-NEXT: s_mul_i32 s2, s2, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s5, s3, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_add_i32 s3, s3, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s3, s3, s4 +; GFX11-NEXT: s_mul_hi_u32 s5, s4, s3 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_add_i32 s4, s4, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB3_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s5, s2, s3 -; GFX11-NEXT: s_mul_i32 s5, s5, s4 +; GFX11-NEXT: s_mul_hi_u32 s5, s3, s4 +; GFX11-NEXT: s_mul_i32 s5, s5, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s5, s2, s5 -; GFX11-NEXT: s_sub_i32 s6, s5, s4 -; GFX11-NEXT: s_cmp_ge_u32 s5, s4 +; GFX11-NEXT: s_sub_i32 s5, s3, s5 +; GFX11-NEXT: s_sub_i32 s6, s5, s2 +; GFX11-NEXT: s_cmp_ge_u32 s5, s2 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s6, s5, s4 -; GFX11-NEXT: s_cmp_ge_u32 s5, s4 +; GFX11-NEXT: s_sub_i32 s6, s5, s2 +; GFX11-NEXT: s_cmp_ge_u32 s5, s2 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5 -; GFX11-NEXT: s_add_i32 s2, s2, 1 +; GFX11-NEXT: s_add_i32 s3, s3, 1 ; GFX11-NEXT: v_mov_b32_e32 v1, s5 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 4 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX11-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 @@ -637,14 +634,14 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: udiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 
0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX9-NEXT: s_movk_i32 s4, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -658,6 +655,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_mad_f32 v4, -v6, v0, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v5, v4, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -666,12 +664,12 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: udiv16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_and_b32 s0, s4, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB4_1: ; %bb3 @@ -687,7 +685,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v6, s0 -; GFX10-NEXT: global_store_short v5, v4, s[4:5] +; GFX10-NEXT: global_store_short v5, v4, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB4_1 ; 
GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -695,11 +693,11 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: udiv16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_and_b32 s0, s4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -748,14 +746,14 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: urem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s3, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s0, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x400 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -767,11 +765,10 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2 ; GFX9-NEXT: v_sub_u32_sdwa v4, v2, v4 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2 ; GFX9-NEXT: global_store_short v5, v4, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -780,13 +777,13 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: urem16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s2, s4, 0xffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_and_b32 s0, s4, 0xffff +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -798,10 +795,10 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_mul_lo_u32 v4, v4, s2 +; GFX10-NEXT: v_mul_lo_u32 v4, v4, s0 ; GFX10-NEXT: v_sub_nc_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 -; GFX10-NEXT: global_store_short v5, v4, s[0:1] +; GFX10-NEXT: global_store_short v5, v4, s[2:3] ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX10-NEXT: s_cbranch_vccz .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 @@ -810,11 +807,11 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: 
urem16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -865,19 +862,19 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: sdiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s3, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_movk_i32 s2, 0x400 +; GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s5, s3 +; GFX9-NEXT: s_sext_i32_i16 s5, s4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 -; GFX9-NEXT: s_xor_b32 s6, s5, s4 +; GFX9-NEXT: s_xor_b32 s6, s5, s2 ; GFX9-NEXT: s_ashr_i32 s5, s6, 30 ; GFX9-NEXT: s_or_b32 s5, s5, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1 @@ -886,15 +883,14 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0| ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX9-NEXT: v_add_u16_e64 v2, s3, 1 +; GFX9-NEXT: v_add_u16_e64 v2, s4, 1 ; 
GFX9-NEXT: s_cselect_b32 s5, s5, 0 -; GFX9-NEXT: s_and_b32 s6, 0xffff, s3 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s3, v2 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s4 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2 +; GFX9-NEXT: v_readfirstlane_b32 s4, v2 ; GFX9-NEXT: v_add_u32_e32 v2, s5, v4 ; GFX9-NEXT: s_lshl_b32 s5, s6, 1 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -903,19 +899,19 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: sdiv16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s2, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX10-NEXT: s_sext_i32_i16 s0, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB6_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_sext_i32_i16 s4, s3 -; GFX10-NEXT: v_add_nc_u16 v2, s3, 1 +; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: v_add_nc_u16 v2, s1, 1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX10-NEXT: s_xor_b32 s5, s4, s2 +; GFX10-NEXT: s_xor_b32 s5, s4, s0 ; GFX10-NEXT: s_ashr_i32 s4, s5, 30 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 @@ -926,12 +922,12 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0| ; GFX10-NEXT: s_and_b32 s5, s5, exec_lo ; GFX10-NEXT: s_cselect_b32 s4, s4, 0 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s3 -; GFX10-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10-NEXT: 
s_and_b32 s5, 0xffff, s1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v4 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: global_store_short v3, v2, s[0:1] +; GFX10-NEXT: global_store_short v3, v2, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -939,11 +935,11 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: sdiv16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -999,19 +995,19 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: srem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s3, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_movk_i32 s2, 0x400 +; GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s5, s3 +; GFX9-NEXT: s_sext_i32_i16 s5, s4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 -; GFX9-NEXT: 
s_xor_b32 s6, s5, s4 +; GFX9-NEXT: s_xor_b32 s6, s5, s2 ; GFX9-NEXT: s_ashr_i32 s6, s6, 30 ; GFX9-NEXT: s_or_b32 s8, s6, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1 @@ -1020,18 +1016,16 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0| ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX9-NEXT: v_add_u16_e64 v2, s3, 1 +; GFX9-NEXT: v_add_u16_e64 v2, s4, 1 ; GFX9-NEXT: s_cselect_b32 s6, s8, 0 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 -; GFX9-NEXT: s_and_b32 s7, 0xffff, s3 -; GFX9-NEXT: v_readfirstlane_b32 s3, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s4 +; GFX9-NEXT: v_readfirstlane_b32 s4, v2 ; GFX9-NEXT: v_add_u32_e32 v2, s6, v4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 ; GFX9-NEXT: s_lshl_b32 s6, s7, 1 -; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -1040,19 +1034,19 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: srem16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s2, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX10-NEXT: s_sext_i32_i16 s0, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB7_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_sext_i32_i16 s4, s3 -; GFX10-NEXT: v_add_nc_u16 v2, s3, 1 +; 
GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: v_add_nc_u16 v2, s1, 1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX10-NEXT: s_xor_b32 s5, s4, s2 +; GFX10-NEXT: s_xor_b32 s5, s4, s0 ; GFX10-NEXT: s_ashr_i32 s5, s5, 30 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 @@ -1065,13 +1059,13 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_cselect_b32 s5, s5, 0 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v3, s5, v3 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s3 -; GFX10-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mul_lo_u32 v3, v3, s2 +; GFX10-NEXT: v_mul_lo_u32 v3, v3, s0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s4, v3 -; GFX10-NEXT: global_store_short v2, v3, s[0:1] +; GFX10-NEXT: global_store_short v2, v3, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -1079,11 +1073,11 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: srem16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index 011a366267afe1..9da07ea04ded59 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -12,8 +12,8 @@ define 
amdgpu_kernel void @udot2(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -40,8 +40,8 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -69,49 +69,48 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, 
s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -150,8 +149,8 @@ entry: define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MulMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -179,8 +178,8 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_MulMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -207,46 +206,45 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_MulMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s0 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MulMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MulMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -290,8 +288,8 @@ entry: define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -318,8 +316,8 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -347,49 +345,48 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -425,8 +422,8 @@ entry: define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MixedTypedMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -453,8 +450,8 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MixedTypedMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ 
-482,46 +479,45 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MixedTypedMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedTypedMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; 
GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedTypedMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -566,8 +562,8 @@ entry: define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_alt_AddOperands: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -594,8 +590,8 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_alt_AddOperands: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -623,13 +619,13 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_alt_AddOperands: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 @@ -638,38 +634,37 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_alt_AddOperands: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; 
%entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -705,8 +700,8 @@ entry: define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MixedExt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -733,8 +728,8 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MixedExt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -762,46 +757,45 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MixedExt: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedExt: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; 
; GFX10-DL-LABEL: idot2_MixedExt: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -846,8 +840,8 @@ entry: define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_SameVec: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -872,8 +866,8 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notudot2_SameVec: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -899,48 +893,47 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notudot2_SameVec: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; 
GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v2, s2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v2, s0, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_SameVec: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v2, s2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v2, s0, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_SameVec: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; 
GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -986,8 +979,8 @@ entry: define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_v4i16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1015,8 +1008,8 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_v4i16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1044,49 +1037,48 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_v4i16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; 
GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 ; GFX10-DL-NEXT: 
global_store_dword v0, v1, s[0:1] @@ -1122,8 +1114,8 @@ entry: define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_v4i16_Hi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 @@ -1150,8 +1142,8 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_v4i16_Hi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1183,49 +1175,48 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_v4i16_Hi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 
v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16_Hi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -1261,8 +1252,8 @@ entry: define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1, ; GFX7-LABEL: 
notudot2_v4i16_Even: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1290,8 +1281,8 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notudot2_v4i16_Even: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1319,46 +1310,45 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notudot2_v4i16_Even: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] ; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX9-NODL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX9-NODL-NEXT: 
global_store_dword v4, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Even: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] ; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX9-DL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX9-DL-NEXT: global_store_dword v4, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Even: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -1403,8 +1393,8 @@ entry: define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_v4i16_Middle: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1432,8 +1422,8 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notudot2_v4i16_Middle: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1461,46 +1451,45 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notudot2_v4i16_Middle: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] ; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX9-NODL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX9-NODL-NEXT: global_store_dword v4, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Middle: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] ; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX9-DL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX9-DL-NEXT: global_store_dword v4, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Middle: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -1545,8 +1534,8 @@ entry: define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_DiffIndex: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1573,8 +1562,8 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) 
%src1, ; ; GFX8-LABEL: notudot2_DiffIndex: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1602,46 +1591,45 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notudot2_DiffIndex: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_DiffIndex: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: 
global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_DiffIndex: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1686,8 +1674,8 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1715,8 +1703,8 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_MultipleUses_add1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1745,50 +1733,49 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1838,8 +1825,8 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1867,8 +1854,8 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MultipleUses_add1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1897,50 +1884,49 @@ 
define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1990,8 +1976,8 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2019,8 +2005,8 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_MultipleUses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2049,13 +2035,13 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_MultipleUses_mul1: ; GFX9-NODL: ; 
%bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 @@ -2064,20 +2050,20 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1 @@ -2086,17 +2072,16 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; 
GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2148,8 +2133,8 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2177,8 +2162,8 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MultipleUses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2207,13 +2192,13 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: 
idot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 16 @@ -2222,20 +2207,20 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v4, v3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v3, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v3, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 16 @@ -2244,17 +2229,16 @@ define amdgpu_kernel void 
@idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, v4, v3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2306,8 +2290,8 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_mul2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2335,8 +2319,8 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_MultipleUses_mul2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2365,13 +2349,13 @@ define amdgpu_kernel void 
@udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2379,20 +2363,20 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2400,17 +2384,16 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2462,8 +2445,8 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_mul2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2491,8 +2474,8 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MultipleUses_mul2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2521,13 +2504,13 @@ 
define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2535,20 +2518,20 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2556,17 +2539,16 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2618,8 +2600,8 @@ entry: define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2646,8 +2628,8 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2673,14 
+2655,14 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -2688,19 +2670,19 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-NODL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -2708,21 +2690,21 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -2730,7 +2712,7 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2759,8 +2741,8 @@ entry: define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; GFX7-LABEL: notsdot2_sext8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2787,8 +2769,8 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notsdot2_sext8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2818,13 +2800,13 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notsdot2_sext8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -2832,36 +2814,35 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notsdot2_sext8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 
v0, 1, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0001 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0001 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notsdot2_sext8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index 1d68b0ba0a2800..fdd913867c8f89 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -44,8 +44,8 @@ define amdgpu_kernel void 
@idot4_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -79,13 +79,13 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -93,32 +93,31 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s2, v4 +; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -134,13 +133,10 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -197,8 +193,8 @@ entry: define amdgpu_kernel void @idot4_acc16(ptr 
addrspace(1) %src1, ; GFX7-LABEL: idot4_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -239,8 +235,8 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -280,14 +276,14 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_bfe_i32 v6, v1, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -309,49 +305,47 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; 
GFX9-DL-LABEL: idot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_sshort v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_sshort v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_sshort v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_sshort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v2, v3 -; GFX10-DL-NEXT: global_store_short v1, v4, s[0:1] +; GFX10-DL-NEXT: global_store_short v1, v4, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc16: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -408,8 +402,8 @@ entry: define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -442,8 +436,8 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -475,14 +469,14 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: 
v_lshrrev_b32_e32 v6, 8, v1 @@ -496,49 +490,47 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: 
global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc8: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -587,8 +579,8 @@ entry: define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_multiuse_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -622,8 +614,8 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_multiuse_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -658,13 +650,13 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -675,38 +667,37 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v3, v4 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s0 ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; 
GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -725,12 +716,9 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_multiuse_mul1: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -796,8 +784,8 @@ entry: define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -830,8 +818,8 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -867,13 +855,13 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 @@ -884,32 +872,31 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v5, s2, v3 +; GFX9-NODL-NEXT: v_add3_u32 v2, v5, s0, v3 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: 
global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -925,13 +912,10 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -974,8 +958,8 @@ entry: define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 
s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1016,8 +1000,8 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1053,15 +1037,15 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0x5040100 +; GFX9-NODL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1070,35 +1054,35 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s2 -; GFX9-NODL-NEXT: v_perm_b32 v1, v6, v1, s2 +; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s0 +; GFX9-NODL-NEXT: v_perm_b32 v1, v6, v1, s0 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v8, 8, v4 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 ; 
GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: v_perm_b32 v5, v9, v5, s2 -; GFX9-NODL-NEXT: v_perm_b32 v4, v8, v4, s2 +; GFX9-NODL-NEXT: v_perm_b32 v5, v9, v5, s0 +; GFX9-NODL-NEXT: v_perm_b32 v4, v8, v4, s0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1107,30 +1091,29 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s2 -; GFX9-DL-NEXT: v_perm_b32 v1, v6, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v6, v1, s0 ; GFX9-DL-NEXT: 
v_ashrrev_i16_e32 v8, 8, v4 ; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 ; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_perm_b32 v5, v9, v5, s2 -; GFX9-DL-NEXT: v_perm_b32 v4, v8, v4, s2 +; GFX9-DL-NEXT: v_perm_b32 v5, v9, v5, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v8, v4, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1167,13 +1150,10 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc16_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -1245,8 +1225,8 @@ entry: define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_2ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1273,8 +1253,8 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_2ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1302,48 +1282,47 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_2ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; 
GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_2ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_2ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -1362,13 +1341,10 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; 
GFX11-DL-LABEL: idot4_acc32_2ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -1417,8 +1393,8 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1448,8 +1424,8 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1480,13 +1456,13 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -1495,37 +1471,36 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: 
global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -1544,13 +1519,10 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -1606,8 +1578,8 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele_permuted: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1637,8 +1609,8 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele_permuted: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1669,13 +1641,13 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v3, 24, v1 @@ -1684,37 +1656,36 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020003 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020003 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -1733,13 +1704,10 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -1794,8 +1762,8 @@ entry: define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_opt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1826,8 +1794,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_opt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1857,8 +1825,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_opt: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1873,13 +1841,13 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; 
GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_opt: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1887,15 +1855,14 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_opt: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -1909,13 +1876,10 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_opt: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -1969,7 +1933,7 @@ entry: define 
amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -2006,7 +1970,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2044,7 +2008,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] @@ -2067,7 +2031,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_acc32_3src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] @@ -2089,7 +2053,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_acc32_3src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -2112,9 +2076,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3src: ; GFX11-DL: ; %bb.0: 
; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -2181,7 +2143,7 @@ entry: define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3src_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -2215,7 +2177,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3src_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2250,7 +2212,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] @@ -2272,7 +2234,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_acc32_3src_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] @@ -2296,7 +2258,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: 
idot4_acc32_3src_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -2320,9 +2282,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3src_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -2383,44 +2343,44 @@ entry: define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_bad_source: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0xf -; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: s_mov_b32 s14, 0 -; GFX7-NEXT: s_mov_b32 s15, s11 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s12, s[0:1], 0xf +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x11 -; GFX7-NEXT: s_sext_i32_i16 s0, s0 -; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 +; GFX7-NEXT: 
s_sext_i32_i16 s5, s12 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8 -; GFX7-NEXT: v_mad_i32_i24 v1, v3, s0, v1 +; GFX7-NEXT: v_mad_i32_i24 v1, v3, s5, v1 ; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8 ; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8 ; GFX7-NEXT: v_mad_i32_i24 v1, v4, v5, v1 ; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_bad_source: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2430,14 +2390,14 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_sext_i32_i16 s3, s8 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v2, v3, 0, 8 ; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 -; GFX8-NEXT: v_mad_i32_i24 v1, v2, s3, v1 +; GFX8-NEXT: v_mad_i32_i24 v1, v2, s2, v1 ; 
GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 @@ -2451,49 +2411,49 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_bad_source: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_sext_i32_i16 s3, s8 +; GFX9-NODL-NEXT: s_sext_i32_i16 s2, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, s3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, s2, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_bad_source: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 
2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0201 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_sext_i32_i16 s4, s8 +; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0201 +; GFX9-DL-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, s4, v3 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, s2, v3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4 ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm @@ -2501,24 +2461,24 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX10-DL-LABEL: idot4_bad_source: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_sext_i32_i16 s3, s8 -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: 
s_sext_i32_i16 s2, s2 +; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201 ; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s3, s2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s2, s3 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -2526,25 +2486,23 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX11-DL-LABEL: idot4_bad_source: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b32 s8, s[2:3], 0x3c -; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x3c +; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] ; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_sext_i32_i16 s3, s8 -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_sext_i32_i16 s2, s2 +; GFX11-DL-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201 ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, s3, s2 +; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, s2, s3 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: 
v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -2596,8 +2554,8 @@ entry: define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_commutative: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2627,8 +2585,8 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_commutative: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2659,13 +2617,13 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_commutative: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -2674,37 +2632,36 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 
src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_commutative: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_commutative: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX10-DL-NEXT: 
v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -2723,13 +2680,10 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_commutative: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -2789,7 +2743,7 @@ entry: define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3src_3ele_src0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -2822,7 +2776,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3src_3ele_src0: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2856,7 +2810,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3] @@ 
-2878,7 +2832,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2902,7 +2856,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -2926,9 +2880,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -2991,25 +2943,25 @@ entry: define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_4src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GFX7-NEXT: s_mov_b32 s15, 0xf000 -; GFX7-NEXT: s_mov_b32 s18, 0 -; GFX7-NEXT: s_mov_b32 s19, s15 +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s14, 0 +; GFX7-NEXT: s_mov_b32 s15, s3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[16:17], s[4:5] +; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[16:19], 0 addr64 -; 
GFX7-NEXT: s_mov_b64 s[16:17], s[6:7] -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9] -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x11 -; GFX7-NEXT: s_mov_b32 s14, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[8:9] +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[12:13], 0x0 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8 @@ -3017,7 +2969,7 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v5, v3, 0, 8 ; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, s0 +; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v2, v4, 0, 8 ; GFX7-NEXT: v_bfe_i32 v4, v4, 8, 8 @@ -3027,14 +2979,14 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 ; GFX7-NEXT: v_mad_i32_i24 v1, v2, v4, v1 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_4src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; 
GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -3077,9 +3029,9 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_4src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] @@ -3103,9 +3055,9 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_4src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501 ; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -3132,9 +3084,9 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_4src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x3 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3159,11 +3111,9 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_4src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[4:11], 
s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x3 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -3243,8 +3193,8 @@ entry: define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_nonstandard_signed: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -3280,8 +3230,8 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_nonstandard_signed: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3317,10 +3267,10 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_nonstandard_signed: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: 
global_load_dword v2, v0, s[6:7] @@ -3333,7 +3283,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX9-NODL-NEXT: v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 @@ -3342,15 +3292,15 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_nonstandard_signed: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] @@ -3363,7 +3313,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 @@ -3372,16 +3322,15 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_nonstandard_signed: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v6, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3411,12 +3360,9 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_nonstandard_signed: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index fb94b504781b10..0b131ea74f1abb 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ 
b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -44,8 +44,8 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -79,13 +79,13 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -93,38 +93,37 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s2, v4 +; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 ; GFX10-DL-NEXT: 
global_store_dword v0, v1, s[0:1] @@ -132,13 +131,10 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -196,8 +192,8 @@ entry: define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -230,8 +226,8 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -268,15 +264,15 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -287,56 +283,54 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword 
v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX10-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc16: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -394,8 +388,8 @@ entry: define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: 
udot4_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -428,8 +422,8 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -461,14 +455,14 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 @@ -482,49 +476,47 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8: ; 
GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc8: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; 
GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -574,8 +566,8 @@ entry: define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -602,8 +594,8 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -629,14 +621,14 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -644,59 +636,57 @@ define 
amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2 -; GFX9-NODL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0100 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot2_8: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -740,8 +730,8 @@ entry: define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -774,8 +764,8 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -807,14 +797,14 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 @@ -828,49 +818,47 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX9-DL-NEXT: 
global_store_byte v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_CommutationInsideMAD: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -920,8 +908,8 @@ entry: define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_CommutationAccrossMADs: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -954,8 +942,8 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_CommutationAccrossMADs: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -987,14 +975,14 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1008,49 +996,47 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -1100,8 +1086,8 @@ entry: define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_multiuse_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1135,8 +1121,8 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_multiuse_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1171,13 +1157,13 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -1188,38 +1174,37 @@ define amdgpu_kernel void 
@udot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1238,12 +1223,9 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_multiuse_mul1: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -1309,8 +1291,8 @@ entry: define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_multiuse_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1345,8 +1327,8 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_multiuse_add1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1382,13 +1364,13 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_multiuse_add1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8 @@ -1398,38 +1380,37 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s2 -; GFX9-NODL-NEXT: v_add_u32_e32 v4, s2, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s0 +; GFX9-NODL-NEXT: v_add_u32_e32 v4, s0, v2 ; GFX9-NODL-NEXT: v_add3_u32 v2, v2, v3, v6 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v1, v4 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_add1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_add_i32 s3, s2, s2 +; GFX9-DL-NEXT: s_add_i32 s1, s0, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, 
v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX9-DL-NEXT: v_add3_u32 v1, s3, v3, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX9-DL-NEXT: v_add3_u32 v1, s1, v3, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_add1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1447,12 +1428,9 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_multiuse_add1: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -1520,8 +1498,8 @@ entry: define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX7-LABEL: notdot4_mixedtypes: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1556,8 +1534,8 @@ define 
amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notdot4_mixedtypes: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1594,15 +1572,15 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notdot4_mixedtypes: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1613,27 +1591,27 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v5, v2, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notdot4_mixedtypes: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0302 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0302 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1644,25 +1622,25 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_bfe_i32 v5, v2, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1678,16 +1656,14 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: notdot4_mixedtypes: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 @@ -1763,8 +1739,8 @@ entry: define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX7-LABEL: notdot4_mixedtypes2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1801,8 +1777,8 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr 
addrspace(1) %src1, ; ; GFX8-LABEL: notdot4_mixedtypes2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1841,15 +1817,15 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notdot4_mixedtypes2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1861,7 +1837,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7, v8, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v5, v6, v3 @@ -1869,20 +1845,20 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; 
GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notdot4_mixedtypes2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1894,7 +1870,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v8, v3 -; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v5, v6, v3 @@ -1902,14 +1878,14 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; 
GFX10-DL-LABEL: notdot4_mixedtypes2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -1917,7 +1893,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1937,16 +1913,14 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: notdot4_mixedtypes2: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 @@ -2027,8 +2001,8 @@ entry: define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2061,8 +2035,8 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2096,13 +2070,13 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -2110,38 +2084,37 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s2, v4 +; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 ; 
GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -2149,13 +2122,10 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; 
GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -2199,8 +2169,8 @@ entry: define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2237,8 +2207,8 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2273,16 +2243,16 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff -; GFX9-NODL-NEXT: s_mov_b32 s3, 0x5040100 +; 
GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_mov_b32 s1, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -2290,13 +2260,13 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-NODL-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_and_b32_e32 v9, 0xff, v2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s3 -; GFX9-NODL-NEXT: v_perm_b32 v1, v5, v1, s3 -; GFX9-NODL-NEXT: v_perm_b32 v5, v6, v9, s3 -; GFX9-NODL-NEXT: v_perm_b32 v4, v4, v8, s3 +; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s1 +; GFX9-NODL-NEXT: v_perm_b32 v1, v5, v1, s1 +; GFX9-NODL-NEXT: v_perm_b32 v5, v6, v9, s1 +; GFX9-NODL-NEXT: v_perm_b32 v4, v4, v8, s1 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) @@ -2304,21 +2274,21 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v2, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff -; GFX9-DL-NEXT: s_mov_b32 s3, 0x5040100 +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: s_mov_b32 s1, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -2326,13 +2296,13 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xff, v2 -; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s3 -; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s3 -; GFX9-DL-NEXT: v_perm_b32 v5, v6, v9, s3 -; GFX9-DL-NEXT: v_perm_b32 v4, v4, v8, s3 +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, 
s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v9, s1 +; GFX9-DL-NEXT: v_perm_b32 v4, v4, v8, s1 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -2340,15 +2310,14 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -2384,12 +2353,9 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc16_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -2461,8 +2427,8 @@ entry: define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc8_vecMul: ; GFX7: ; 
%bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2495,8 +2461,8 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2532,14 +2498,14 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc8_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -2555,19 +2521,19 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8_vecMul: 
; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -2583,15 +2549,14 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2625,13 +2590,10 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc8_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -2700,8 +2662,8 @@ entry: define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_2ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2728,8 +2690,8 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_2ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2757,48 +2719,47 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_2ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; 
GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_2ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_2ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: 
global_load_dword v1, v0, s[6:7] @@ -2816,13 +2777,10 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_2ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -2870,8 +2828,8 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2901,8 +2859,8 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2933,13 +2891,13 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -2948,37 +2906,36 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] @@ -2996,13 +2953,10 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -3057,8 +3011,8 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele_permuted: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -3088,8 +3042,8 @@ define amdgpu_kernel void 
@idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele_permuted: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3120,13 +3074,13 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 24, v1 @@ -3135,37 +3089,36 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020003 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020003 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] @@ -3183,13 +3136,10 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; 
GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -3245,8 +3195,8 @@ entry: define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_opt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -3277,8 +3227,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_opt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3308,8 +3258,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_opt: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3324,13 +3274,13 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, 
v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_opt: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3338,15 +3288,14 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_opt: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3359,13 +3308,10 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_opt: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -3419,7 +3365,7 @@ entry: define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_3src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -3456,7 +3402,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_3src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -3494,7 +3440,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] @@ -3517,7 +3463,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_acc32_3src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] @@ -3539,7 +3485,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: udot4_acc32_3src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -3561,9 +3507,7 @@ 
define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_3src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -3631,7 +3575,7 @@ entry: define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_3src_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -3665,7 +3609,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_3src_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -3700,7 +3644,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] @@ -3722,7 +3666,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_acc32_3src_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] @@ 
-3746,7 +3690,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: udot4_acc32_3src_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -3769,9 +3713,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_3src_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -3834,44 +3776,44 @@ entry: define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_bad_source: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0xf -; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: s_mov_b32 s14, 0 -; GFX7-NEXT: s_mov_b32 s15, s11 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s12, s[0:1], 0xf +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x11 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: 
buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 +; GFX7-NEXT: s_and_b32 s5, s12, 0xffff +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX7-NEXT: v_mad_u32_u24 v1, v3, s0, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, s5, v1 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v5, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_bad_source: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -3881,14 +3823,14 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_and_b32 s3, s8, 0xffff +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, 
v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 -; GFX8-NEXT: v_mad_u32_u24 v1, v2, s3, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, v2, s2, v1 ; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 @@ -3902,49 +3844,49 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_bad_source: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_and_b32 s3, s8, 0xffff +; GFX9-NODL-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, s3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, s2, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_bad_source: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX9-DL-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0201 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_and_b32 s4, s8, 0xffff +; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0201 +; GFX9-DL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, s4, v3 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, s2, v3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm @@ -3952,24 +3894,24 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX10-DL-LABEL: udot4_bad_source: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: 
global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_and_b32 s3, s8, 0xffff -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201 ; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, s3, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, s2, s3 ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v2, v0 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -3977,25 +3919,23 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX11-DL-LABEL: udot4_bad_source: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b32 s8, s[2:3], 0x3c -; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x3c +; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] ; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_and_b32 s3, s8, 0xffff -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-DL-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201 ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, s3, s2 +; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, s2, s3 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2 ; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -4047,8 +3987,8 @@ entry: define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_commutative: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -4078,8 +4018,8 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_commutative: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -4110,13 +4050,13 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_commutative: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -4125,37 +4065,36 @@ define amdgpu_kernel void 
@udot4_commutative(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_commutative: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_commutative: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] @@ -4173,13 +4112,10 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_commutative: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -4239,7 +4175,7 @@ entry: define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_3src_3ele_src0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -4272,7 +4208,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_3src_3ele_src0: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -4306,7 +4242,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 
0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3] @@ -4328,7 +4264,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -4352,7 +4288,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -4375,9 +4311,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -4440,25 +4374,25 @@ entry: define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_4src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GFX7-NEXT: s_mov_b32 s15, 0xf000 -; GFX7-NEXT: s_mov_b32 s18, 0 -; GFX7-NEXT: s_mov_b32 s19, s15 +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s14, 0 +; GFX7-NEXT: s_mov_b32 s15, s3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[16:17], s[4:5] 
+; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[6:7] -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9] -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x11 -; GFX7-NEXT: s_mov_b32 s14, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[8:9] +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[12:13], 0x0 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 @@ -4466,7 +4400,7 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, s0 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 8, 8 @@ -4476,14 +4410,14 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v2, v4, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_4src: ; GFX8: ; %bb.0: ; %entry 
-; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -4526,9 +4460,9 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_4src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] @@ -4552,9 +4486,9 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_4src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501 ; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -4581,9 +4515,9 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: udot4_4src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x3 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -4607,11 +4541,9 @@ define 
amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_4src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x3 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -4691,8 +4623,8 @@ entry: define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_multi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -4734,8 +4666,8 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_multi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -4777,13 +4709,13 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_multi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dwordx2 
v[0:1], v2, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v3, v2, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -4795,45 +4727,44 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v9, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v3, v4, s2, v6 +; GFX9-NODL-NEXT: v_add3_u32 v3, v4, s0, v6 ; GFX9-NODL-NEXT: v_add3_u32 v3, v3, v7, v9 ; GFX9-NODL-NEXT: v_add3_u32 v0, v5, v3, v0 ; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v8, v1 -; GFX9-NODL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_multi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x6040200 -; GFX9-DL-NEXT: s_mov_b32 s3, 0x2000200 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x6040200 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x2000200 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v2, s[6:7] -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s4, 0x7050301 ; GFX9-DL-NEXT: s_mov_b32 s6, 0x3010301 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v4, v1, v0, s2 +; 
GFX9-DL-NEXT: v_perm_b32 v4, v1, v0, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v5, v3, v3, s3 +; GFX9-DL-NEXT: v_perm_b32 v5, v3, v3, s1 ; GFX9-DL-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v4, v5, s5 ; GFX9-DL-NEXT: v_perm_b32 v3, v3, v3, s6 ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v0, v3, v1 -; GFX9-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_multi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] @@ -4854,12 +4785,9 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_multi: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b64 v[0:1], v2, s[4:5] @@ -4954,8 +4882,8 @@ entry: define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_hilo: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; 
GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -4986,8 +4914,8 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_hilo: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -5019,8 +4947,8 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_hilo: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 @@ -5035,13 +4963,13 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_hilo: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 @@ -5049,15 +4977,14 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr 
addrspace(1) %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_hilo: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 @@ -5070,13 +4997,10 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_hilo: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] offset:4 @@ -5131,8 +5055,8 @@ entry: define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_lohi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -5163,8 +5087,8 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr 
addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_lohi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -5196,8 +5120,8 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_lohi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] @@ -5212,34 +5136,33 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_lohi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x10302 -; GFX9-DL-NEXT: s_mov_b32 s3, 0x3020001 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x10302 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 
; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_lohi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 @@ -5255,13 +5178,10 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_lohi: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4 @@ -5320,8 +5240,8 @@ entry: define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_hihi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: 
s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 @@ -5352,8 +5272,8 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_hihi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -5387,8 +5307,8 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_hihi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 @@ -5403,34 +5323,33 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_hihi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x1030200 -; GFX9-DL-NEXT: s_mov_b32 s3, 0x3010002 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x1030200 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x3010002 ; 
GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] offset:4 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_hihi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 @@ -5446,13 +5365,10 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_hihi: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4 @@ -5511,16 +5427,16 @@ entry: define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_v8i8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_mov_b32 s3, s7 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v4, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v5, v1, 8, 8 @@ -5534,17 +5450,17 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: v_mad_u32_u24 v2, v6, v7, v2 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc32_v8i8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -5558,19 +5474,19 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 
-; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32_v8i8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1 @@ -5584,12 +5500,12 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_acc32_v8i8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 ; GFX9-DL-NEXT: global_store_dword v2, v0, s[4:5] @@ -5597,30 +5513,28 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_acc32_v8i8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_v8i8: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: global_load_b64 v[0:1], v0, s[0:1] +; GFX11-DL-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-DL-NEXT: s_nop 0 ; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm @@ -5667,8 +5581,8 @@ entry: define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_v16i8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -5702,8 +5616,8 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_v16i8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5734,8 +5648,8 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_v16i8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NODL-NEXT: ; kill: killed $vgpr5 @@ -5754,42 +5668,41 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-NODL-NEXT: v_add3_u32 v0, v2, v6, v0 -; GFX9-NODL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_v16i8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x7050002 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x7050002 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] ; GFX9-DL-NEXT: global_load_dword v0, v5, s[6:7] -; GFX9-DL-NEXT: s_mov_b32 s3, 0x3020001 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 ; GFX9-DL-NEXT: ; kill: 
killed $vgpr5 ; GFX9-DL-NEXT: ; kill: killed $vgpr4 -; GFX9-DL-NEXT: v_perm_b32 v2, v3, v2, s2 +; GFX9-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 +; GFX9-DL-NEXT: v_perm_b32 v2, v3, v2, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v0, v0, v0, s3 +; GFX9-DL-NEXT: v_perm_b32 v0, v0, v0, s1 ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, 0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_v16i8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; kill: killed $vgpr5 ; GFX10-DL-NEXT: ; kill: killed $vgpr4 +; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] ; GFX10-DL-NEXT: global_load_dword v0, v5, s[6:7] @@ -5804,13 +5717,10 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_v16i8: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[4:5] ; GFX11-DL-NEXT: global_load_b32 v0, v4, s[6:7] @@ -5869,8 +5779,8 @@ 
entry: define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_v256i8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -5903,8 +5813,8 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_v256i8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_movk_i32 s2, 0xfc @@ -5938,8 +5848,8 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_v256i8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) @@ -5955,36 +5865,35 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v6, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_v256i8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x3020001 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x3020001 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v3, v1, s[4:5] offset:252 -; GFX9-DL-NEXT: s_mov_b32 s3, 0x1000302 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x1000302 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v2, v2, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s1 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_v256i8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_dword v2, v1, s[6:7] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] offset:252 @@ -5999,13 +5908,10 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_v256i8: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: global_load_b32 v1, v1, s[6:7] ; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] offset:252 @@ -6063,8 +5969,8 @@ entry: define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_anyext: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -6090,8 +5996,8 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_anyext: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -6118,49 +6024,48 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_anyext: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_anyext: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0500 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0500 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0100 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v1, s1 ; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_anyext: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -6177,13 +6082,10 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_anyext: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index 99bb4d50b03d4c..8c53d2671de3f6 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -12,13 +12,13 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc32: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -63,11 +63,11 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -78,10 +78,10 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4 ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4 @@ -116,20 +116,20 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 
s12, s12, s9 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -154,55 +154,54 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v9, v10 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v2, v3, s2, v4 +; GFX9-NEXT: v_add3_u32 v2, v3, s0, v4 ; GFX9-NEXT: v_mul_i32_i24_e32 v7, v11, v12 ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v13, v14 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v6 ; GFX9-NEXT: v_mul_i32_i24_e32 v9, v15, v16 ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v8 ; GFX9-NEXT: v_add3_u32 v1, v2, v9, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc32: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 ; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1] @@ -210,17 +209,16 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc32: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -307,13 +305,13 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -374,11 +372,11 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -391,11 +389,11 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 @@ -454,21 +452,21 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, 
SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -522,26 +520,26 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -595,21 +593,20 @@ define amdgpu_kernel void 
@idot8_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc16: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -679,16 +676,16 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-LABEL: idot8_acc16: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; 
GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -829,13 +826,13 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc8: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -896,11 +893,11 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -913,11 +910,11 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 @@ -976,21 +973,21 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; 
GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1044,26 +1041,26 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1117,21 +1114,20 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: 
idot8_acc8: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -1201,16 +1197,16 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-LABEL: idot8_acc8: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; 
GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -1352,13 +1348,13 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1405,11 +1401,11 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_multiuses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1420,10 +1416,10 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 
v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4 ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4 @@ -1460,20 +1456,20 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1494,7 +1490,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v2 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-NEXT: v_mad_i32_i24 
v2, v3, v4, s0 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v6 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v7, v8 ; GFX9-NEXT: v_mad_i32_i24 v3, v3, v4, v2 @@ -1506,25 +1502,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add3_u32 v3, v3, v7, v8 ; GFX9-NEXT: v_add3_u32 v3, v3, v9, v10 ; GFX9-NEXT: v_add3_u32 v1, v3, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -1545,7 +1541,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v1, v1, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; 
GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, v5, v6 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, v7, v8 ; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, v2 @@ -1557,21 +1553,20 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v7, v8 ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v9, v10 ; GFX9-DL-NEXT: v_add3_u32 v1, v3, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -1615,16 +1610,15 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; 
GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -1743,13 +1737,13 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1794,11 +1788,11 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1809,10 +1803,10 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v3 ; GFX8-NEXT: v_bfe_i32 v2, v3, 24, 4 @@ -1847,20 +1841,20 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; 
GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 28, v1 ; GFX9-NEXT: v_bfe_i32 v4, v1, 24, 4 @@ -1884,7 +1878,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v8, v15 ; GFX9-NEXT: v_mul_i32_i24_e32 v7, v7, v14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v1, s2, v2 +; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v6, v13 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v12 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 @@ -1892,48 +1886,47 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_i32_i24_e32 v3, v3, v10 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; 
GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 ; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1] @@ -1941,17 +1934,16 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul: ; 
GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -2002,13 +1994,13 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2069,11 +2061,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, 
; ; GFX8-LABEL: idot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2086,11 +2078,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 @@ -2149,22 +2141,22 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 @@ -2199,9 +2191,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v2 -; GFX9-NEXT: v_perm_b32 v7, v8, v7, s2 -; GFX9-NEXT: v_perm_b32 v8, v13, v12, s2 -; GFX9-NEXT: v_perm_b32 v5, v6, v5, s2 +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s0 +; GFX9-NEXT: v_perm_b32 v8, v13, v12, s0 +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s0 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16 @@ -2213,11 +2205,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v8 -; GFX9-NEXT: v_perm_b32 v2, v2, v4, s2 -; GFX9-NEXT: v_perm_b32 v1, v1, v11, s2 -; GFX9-NEXT: v_perm_b32 v4, v17, v16, s2 -; GFX9-NEXT: v_perm_b32 v9, v10, v9, s2 -; GFX9-NEXT: v_perm_b32 v10, v15, v14, s2 +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v17, v16, s0 +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s0 +; GFX9-NEXT: v_perm_b32 v10, v15, v14, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v5, v3 ; GFX9-NEXT: v_pk_mul_lo_u16 
v1, v1, v2 @@ -2230,27 +2222,27 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 @@ -2285,9 +2277,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v15 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 
v17, 12, v17 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v2 -; GFX9-DL-NEXT: v_perm_b32 v7, v8, v7, s2 -; GFX9-DL-NEXT: v_perm_b32 v8, v13, v12, s2 -; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s2 +; GFX9-DL-NEXT: v_perm_b32 v7, v8, v7, s0 +; GFX9-DL-NEXT: v_perm_b32 v8, v13, v12, s0 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s0 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16 @@ -2299,11 +2291,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v8 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v4, s2 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v11, s2 -; GFX9-DL-NEXT: v_perm_b32 v4, v17, v16, s2 -; GFX9-DL-NEXT: v_perm_b32 v9, v10, v9, s2 -; GFX9-DL-NEXT: v_perm_b32 v10, v15, v14, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v4, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v17, v16, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v10, v9, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, v15, v14, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v5, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -2316,21 +2308,20 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -2415,17 +2406,16 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; 
GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -2547,13 +2537,13 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2614,11 +2604,11 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2631,11 +2621,11 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: 
s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3 @@ -2714,21 +2704,21 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1 @@ -2801,26 +2791,26 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 ; GFX9-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 @@ -2893,22 +2883,21 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-DL-XNACK-NEXT: 
s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -2999,17 +2988,16 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; 
GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 779107cc40e1fb..3828fa557731e8 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -10,13 +10,13 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc32: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -61,11 +61,11 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -76,10 +76,10 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, 
s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 ; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 @@ -114,20 +114,20 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 ; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4 @@ -151,7 +151,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v1, s2, v2 +; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 @@ -159,48 +159,47 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 
v3, v3, v10 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: 
s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -282,13 +281,13 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -333,11 +332,11 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: 
s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -350,10 +349,10 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 @@ -386,20 +385,20 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] ; 
GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 @@ -427,25 +426,25 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 @@ -473,27 +472,27 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, 
v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -521,7 +520,7 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -600,13 +599,13 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc8: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -651,11 +650,11 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -668,10 +667,10 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 @@ -704,20 +703,20 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, 
SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 @@ -745,25 +744,25 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 @@ -791,27 +790,27 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: 
global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -839,7 +838,7 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -918,13 +917,13 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc4: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -970,11 +969,11 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc4: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -987,10 +986,10 @@ 
define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 @@ -1024,20 +1023,20 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc4: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -1066,25 +1065,25 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: 
v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -1113,27 +1112,27 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, 
s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1162,7 +1161,7 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1225,13 +1224,13 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: 
s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1277,11 +1276,11 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1294,10 +1293,10 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 @@ -1331,20 +1330,20 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_CommutationInsideMAD: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -1373,25 +1372,25 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, 
s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -1420,27 +1419,27 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; 
GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1469,7 +1468,7 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1530,13 +1529,13 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1583,11 +1582,11 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_multiuses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1598,10 +1597,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 
v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 ; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 @@ -1638,20 +1637,20 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_u32 v3, v1, 4, 4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -1672,7 +1671,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v17, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v1, v1, v2, s2 +; GFX9-NEXT: 
v_mad_u32_u24 v1, v1, v2, s0 ; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v16 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-NEXT: v_mad_u32_u24 v2, v3, v10, v1 @@ -1684,25 +1683,25 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4 ; GFX9-NEXT: v_add3_u32 v1, v17, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_u32 v3, v1, 4, 4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -1723,7 +1722,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v1, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s2 +; GFX9-DL-NEXT: 
v_mad_u32_u24 v1, v1, v2, s0 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v16 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v10, v1 @@ -1735,21 +1734,20 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v5, v4 ; GFX9-DL-NEXT: v_add3_u32 v1, v17, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1867,13 +1865,13 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 
s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1918,11 +1916,11 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1933,10 +1931,10 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 ; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 @@ -1971,20 +1969,20 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; 
GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 ; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4 @@ -2008,7 +2006,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v1, s2, v2 +; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 @@ -2016,48 +2014,47 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; 
GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -2104,13 +2101,13 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2155,11 +2152,11 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2172,10 +2169,10 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, 
s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 @@ -2208,21 +2205,21 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 @@ -2231,16 +2228,16 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s2 -; GFX9-NEXT: v_perm_b32 v7, v12, v11, s2 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-NEXT: v_perm_b32 v7, 
v12, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 ; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s2 -; GFX9-NEXT: v_perm_b32 v9, v14, v13, s2 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -2251,9 +2248,9 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v2, v2, v17, s2 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s2 -; GFX9-NEXT: v_perm_b32 v10, v16, v15, s2 +; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2262,26 +2259,26 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: 
s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 @@ -2290,16 +2287,16 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s2 -; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s2 -; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s2 -; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s2 +; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -2310,9 +2307,9 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) 
%src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s2 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s2 -; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2321,21 +2318,20 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2428,13 +2424,13 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2479,11 +2475,11 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2496,10 +2492,10 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; 
GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3 ; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4 @@ -2552,20 +2548,20 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v4, v3, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 @@ -2612,25 +2608,25 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 ; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v7 -; GFX9-NEXT: global_store_byte v3, v0, s[0:1] +; GFX9-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: 
s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 @@ -2677,22 +2673,21 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7 -; GFX9-DL-NEXT: global_store_byte v3, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, 
SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2791,13 +2786,13 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc4_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2843,11 +2838,11 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc4_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2860,10 +2855,10 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: 
s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 @@ -2897,21 +2892,21 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc4_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 @@ -2920,16 +2915,16 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s2 -; GFX9-NEXT: v_perm_b32 v7, v12, v11, s2 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 +; 
GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 ; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s2 -; GFX9-NEXT: v_perm_b32 v9, v14, v13, s2 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -2940,9 +2935,9 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v2, v2, v17, s2 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s2 -; GFX9-NEXT: v_perm_b32 v10, v16, v15, s2 +; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2952,26 +2947,26 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 @@ -2980,16 +2975,16 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s2 -; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s2 -; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s2 -; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s2 +; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -3000,9 +2995,9 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; 
GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s2 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s2 -; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -3012,21 +3007,20 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3115,8 +3109,8 @@ entry: define 
amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX7-LABEL: udot8_variant1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -3161,8 +3155,8 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; ; GFX8-LABEL: udot8_variant1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3208,13 +3202,13 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; ; GFX9-LABEL: udot8_variant1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v3, 15, v1 @@ -3239,7 +3233,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v6, v5 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v8, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v3, s2, v1 +; GFX9-NEXT: v_add3_u32 v1, v3, s0, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v10, v9 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v12, v11 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v5 @@ -3247,36 
+3241,35 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX9-NEXT: v_mul_u32_u24_e32 v9, v16, v15 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v7 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v9 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_variant1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] diff --git 
a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll index 0f40d010e2a3a9..f7a0e296fa1733 100644 --- a/llvm/test/CodeGen/AMDGPU/imm.ll +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) { ; SI-LABEL: i64_imm_inline_lo: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 5 @@ -17,7 +17,7 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) { ; ; VI-LABEL: i64_imm_inline_lo: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 5 @@ -34,7 +34,7 @@ entry: define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) { ; SI-LABEL: i64_imm_inline_hi: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x12345678 @@ -45,7 +45,7 @@ define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) { ; ; VI-LABEL: i64_imm_inline_hi: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x12345678 @@ -61,7 +61,7 @@ entry: define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) { ; SI-LABEL: store_imm_neg_0.0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -72,7 +72,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_imm_neg_0.0_i64: ; VI: 
; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -87,7 +87,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_neg_0.0_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -97,7 +97,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_neg_0.0_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -111,7 +111,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -121,7 +121,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -135,7 +135,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_imm_neg_0.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -145,7 +145,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_imm_neg_0.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -159,7 +159,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0.5 @@ -169,7 +169,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0.5 @@ -183,7 +183,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -0.5 @@ -193,7 +193,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -0.5 @@ -207,7 +207,7 @@ 
define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -217,7 +217,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -231,7 +231,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -1.0 @@ -241,7 +241,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -1.0 @@ -255,7 +255,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 2.0 @@ -265,7 +265,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr 
addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 2.0 @@ -279,7 +279,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -2.0 @@ -289,7 +289,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -2.0 @@ -303,7 +303,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 4.0 @@ -313,7 +313,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 4.0 @@ -327,7 +327,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: 
store_inline_imm_m_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -4.0 @@ -337,7 +337,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -4.0 @@ -351,7 +351,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_inv_2pi_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e22f983 @@ -361,7 +361,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_inv_2pi_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0.15915494 @@ -375,7 +375,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_inv_2pi_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xbe22f983 @@ -385,7 +385,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) ; ; VI-LABEL: store_inline_imm_m_inv_2pi_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0xbe22f983 @@ -399,7 +399,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_literal_imm_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x45800000 @@ -409,7 +409,7 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_literal_imm_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x45800000 @@ -423,8 +423,8 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_0.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -434,8 +434,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_0.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -450,8 +450,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void 
@add_inline_imm_0.5_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -461,8 +461,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -477,8 +477,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -488,8 +488,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, flo ; ; VI-LABEL: add_inline_imm_neg_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -504,8 +504,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -515,8 +515,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -531,8 +531,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -542,8 +542,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, flo ; ; VI-LABEL: add_inline_imm_neg_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -558,8 +558,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; 
SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -569,8 +569,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -585,8 +585,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -596,8 +596,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, flo ; ; VI-LABEL: add_inline_imm_neg_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -612,8 +612,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt 
lgkmcnt(0) @@ -623,8 +623,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -639,8 +639,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -650,8 +650,8 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, flo ; ; VI-LABEL: add_inline_imm_neg_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -666,7 +666,7 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: commute_add_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -684,7 +684,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, ; ; VI-LABEL: commute_add_inline_imm_0.5_f32: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -708,7 +708,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: commute_add_literal_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -726,7 +726,7 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: commute_add_literal_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -750,8 +750,8 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -761,8 +761,8 @@ define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x) ; ; VI-LABEL: add_inline_imm_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -777,8 +777,8 @@ define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x) define 
amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_2_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -788,8 +788,8 @@ define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x) ; ; VI-LABEL: add_inline_imm_2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -804,8 +804,8 @@ define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x) define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -815,8 +815,8 @@ define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: add_inline_imm_16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -831,8 +831,8 @@ define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_1_f32: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -843,8 +843,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float ; ; VI-LABEL: add_inline_imm_neg_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -862,8 +862,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_2_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -874,8 +874,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float ; ; VI-LABEL: add_inline_imm_neg_2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -893,8 +893,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; 
SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -905,8 +905,8 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, floa ; ; VI-LABEL: add_inline_imm_neg_16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -924,8 +924,8 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_63_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -935,8 +935,8 @@ define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: add_inline_imm_63_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -951,8 +951,8 @@ define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_64_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ 
-962,8 +962,8 @@ define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: add_inline_imm_64_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -978,25 +978,23 @@ define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_0.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_0.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0.0 @@ -1007,25 +1005,23 @@ define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; 
SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 0.5 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 0.5 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0.5 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.5 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0.5 @@ -1036,25 +1032,23 @@ define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], -0.5 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], -0.5 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], -0.5 -; 
VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], -0.5 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -0.5 @@ -1065,25 +1059,23 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 1.0 @@ -1094,25 +1086,23 @@ define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) 
-; SI-NEXT: v_add_f64 v[0:1], s[0:1], -1.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], -1.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], -1.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], -1.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -1.0 @@ -1123,25 +1113,23 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 2.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 2.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 2.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: 
s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 2.0 @@ -1152,25 +1140,23 @@ define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], -2.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], -2.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], -2.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], -2.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -2.0 @@ -1181,25 +1167,23 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 4.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], 
s[2:3], 4.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 4.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 4.0 @@ -1210,25 +1194,23 @@ define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], -4.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], -4.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], -4.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], -4.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: 
s_endpgm %y = fadd double %x, -4.0 @@ -1239,27 +1221,25 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; SI-NEXT: v_mov_b32_e32 v1, 0x3fc45f30 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0.15915494309189532 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.15915494309189532 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x3fc45f306dc9c882 @@ -1270,29 +1250,27 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_m_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; SI-NEXT: v_mov_b32_e32 v1, 0xbfc45f30 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 
v[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_m_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; VI-NEXT: v_mov_b32_e32 v1, 0xbfc45f30 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0xbfc45f306dc9c882 @@ -1303,25 +1281,23 @@ define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], d define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 1 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1 ; 
VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000001 @@ -1332,25 +1308,23 @@ define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32] define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_2_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 2 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_2_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 2 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000002 @@ -1361,25 +1335,23 @@ define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32] define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_16_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 16 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 16 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_16_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 16 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 16 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000010 @@ -1390,7 +1362,7 @@ define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, -1 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1401,7 +1373,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: add_inline_imm_neg_1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, -1 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1417,7 +1389,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_2_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 
v0, -2 @@ -1428,7 +1400,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: add_inline_imm_neg_2_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -2 @@ -1444,7 +1416,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_16_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -16 @@ -1455,7 +1427,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: add_inline_imm_neg_16_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -16 @@ -1471,25 +1443,23 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_63_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 63 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 63 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_63_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 63 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 63 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x000000000000003F @@ -1500,25 +1470,23 @@ define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_64_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 64 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_64_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 64 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000040 @@ -1529,7 +1497,7 @@ define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1540,7 +1508,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1555,7 +1523,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_literal_imm_neg_0.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1566,7 +1534,7 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) ; ; VI-LABEL: store_literal_imm_neg_0.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1581,7 +1549,7 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1592,7 +1560,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; 
VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1607,7 +1575,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1618,7 +1586,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1633,7 +1601,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1644,7 +1612,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1659,7 +1627,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1670,7 +1638,7 @@ define 
amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1685,7 +1653,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1696,7 +1664,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1711,7 +1679,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1722,7 +1690,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1737,7 +1705,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr 
addrspace(1) %out) { ; SI-LABEL: store_inline_imm_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1748,7 +1716,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1763,7 +1731,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1774,7 +1742,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1789,7 +1757,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1800,7 +1768,7 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1815,7 +1783,7 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1826,7 +1794,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) ; ; VI-LABEL: store_inline_imm_m_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1841,7 +1809,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_literal_imm_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1852,7 +1820,7 @@ define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_literal_imm_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll index f407a1c26dd3eb..dcc615232e56be 100644 --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_neg_0.0_i16: ; 
GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -20,7 +20,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_neg_0.0_i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -33,7 +33,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_neg_0.0_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] @@ -44,7 +44,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_neg_0.0_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: 
s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x8000 @@ -59,7 +59,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_0.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -69,7 +69,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_0.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -81,7 +81,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] @@ -91,7 +91,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: 
store_inline_imm_0.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -105,7 +105,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_imm_neg_0.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -115,7 +115,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_imm_neg_0.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -127,7 +127,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_imm_neg_0.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; 
encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] @@ -137,7 +137,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_imm_neg_0.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x8000 @@ -151,7 +151,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_0.5_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -161,7 +161,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_0.5_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -173,7 +173,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: 
[0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00] @@ -183,7 +183,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3800 @@ -197,7 +197,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_0.5_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -207,7 +207,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_0.5_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: 
s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -219,7 +219,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff] @@ -229,7 +229,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xb800 @@ -243,7 +243,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_1.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -253,7 +253,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_1.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 
; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -265,7 +265,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00] @@ -275,7 +275,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3c00 @@ -289,7 +289,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_1.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -299,7 +299,7 @@ define amdgpu_kernel void 
@store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_1.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -311,7 +311,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff] @@ -321,7 +321,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xbc00 @@ -335,7 +335,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_2.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 
0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -345,7 +345,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_2.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -357,7 +357,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00] @@ -367,7 +367,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x4000 @@ -381,7 +381,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_2.0_f16: ; GFX10: ; 
%bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -391,7 +391,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_2.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -403,7 +403,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff] @@ -413,7 +413,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 
-1 ; SI-NEXT: v_mov_b32_e32 v0, 0xc000 @@ -427,7 +427,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_4.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -437,7 +437,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_4.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -449,7 +449,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00] @@ -459,7 +459,7 @@ define amdgpu_kernel void 
@store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x4400 @@ -473,7 +473,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_4.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -483,7 +483,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_4.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -495,7 +495,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 
0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff] @@ -505,7 +505,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xc400 @@ -519,7 +519,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_inv_2pi_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -529,7 +529,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_inv_2pi_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -541,7 +541,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; ; 
VI-LABEL: store_inline_imm_inv_2pi_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00] @@ -551,7 +551,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_inv_2pi_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3118 @@ -565,7 +565,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_inv_2pi_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -575,7 +575,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: store_inline_imm_m_inv_2pi_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: 
[0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -587,7 +587,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) ; ; VI-LABEL: store_inline_imm_m_inv_2pi_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff] @@ -597,7 +597,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) ; ; SI-LABEL: store_inline_imm_m_inv_2pi_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xb118 @@ -611,7 +611,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_literal_imm_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -621,7 +621,7 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_literal_imm_f16: ; GFX11: ; 
%bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -633,7 +633,7 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_literal_imm_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00] @@ -643,7 +643,7 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_literal_imm_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x6c00 @@ -658,8 +658,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_0.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: 
[0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00] @@ -670,12 +670,12 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_0.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x00,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -683,26 +683,25 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_0.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: 
s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x00,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x00,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_0.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0.0 @@ -714,8 +713,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_0.5_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; 
GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00] @@ -726,12 +725,12 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_0.5_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe0,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -739,26 +738,25 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; 
encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe0,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe0,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0.5, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0.5 @@ -770,8 +768,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_0.5_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 
0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00] @@ -782,12 +780,12 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_0.5_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe2,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -795,26 +793,25 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; ; VI-LABEL: add_inline_imm_neg_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: 
[0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe2,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe2,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, -0.5, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, -0.5 @@ -826,8 +823,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_1.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 
; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00] @@ -838,12 +835,12 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_1.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe4,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -851,26 +848,25 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: 
[0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe4,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe4,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 1.0 @@ -882,8 +878,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_1.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; 
encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00] @@ -894,12 +890,12 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_1.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe6,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -907,26 +903,25 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; ; VI-LABEL: add_inline_imm_neg_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: 
[0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe6,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe6,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, -1.0 @@ -938,8 +933,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_2.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 
; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00] @@ -950,12 +945,12 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_2.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe8,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -963,26 +958,25 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: 
[0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe8,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xe8,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 2.0 @@ -994,8 +988,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_2.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; 
encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00] @@ -1006,12 +1000,12 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_2.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xea,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1019,26 +1013,25 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; ; VI-LABEL: add_inline_imm_neg_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: 
[0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xea,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xea,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, -2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, -2.0 @@ -1050,8 +1043,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_4.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 
0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00] @@ -1062,12 +1055,12 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_4.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xec,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1075,26 +1068,25 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: 
[0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xec,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xec,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 4.0 @@ -1106,8 +1098,8 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_4.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; 
encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00] @@ -1118,12 +1110,12 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_4.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xee,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1131,26 +1123,25 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; ; VI-LABEL: add_inline_imm_neg_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: 
[0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xee,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0xee,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, -4.0 @@ -1161,7 +1152,7 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: commute_add_inline_imm_0.5_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 
s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1179,7 +1170,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ; ; GFX11-LABEL: commute_add_inline_imm_0.5_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1199,7 +1190,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ; ; VI-LABEL: commute_add_inline_imm_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1217,7 +1208,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ; ; SI-LABEL: commute_add_inline_imm_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1243,7 +1234,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: commute_add_literal_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: 
[0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1261,7 +1252,7 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: commute_add_literal_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1281,7 +1272,7 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: commute_add_literal_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1299,7 +1290,7 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad ; ; SI-LABEL: commute_add_literal_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1326,8 +1317,8 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_1_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; 
GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00] @@ -1338,12 +1329,12 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_1_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x02,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ 
-1351,26 +1342,25 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 1 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x02,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 1 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x02,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x33800000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH0001 @@ -1382,8 +1372,8 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_2_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; 
encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00] @@ -1394,12 +1384,12 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_2_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x04,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1407,26 +1397,25 @@ define amdgpu_kernel void 
@add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 2 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x04,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 2 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x04,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_2_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x34000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH0002 @@ -1438,8 +1427,8 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_16_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: 
[0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00] @@ -1450,12 +1439,12 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_16_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x20,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1463,26 +1452,25 @@ define amdgpu_kernel void 
@add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_16_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 16 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x20,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 16 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x20,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_16_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x35800000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH0010 @@ -1493,7 +1481,7 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: add_inline_imm_neg_1_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 
0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1511,7 +1499,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: add_inline_imm_neg_1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1531,7 +1519,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: add_inline_imm_neg_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1549,7 +1537,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; ; SI-LABEL: add_inline_imm_neg_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1574,7 +1562,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a 
define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: add_inline_imm_neg_2_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1592,7 +1580,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: add_inline_imm_neg_2_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1612,7 +1600,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: add_inline_imm_neg_2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1630,7 +1618,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; ; SI-LABEL: add_inline_imm_neg_2_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1655,7 +1643,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: add_inline_imm_neg_16_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1673,7 +1661,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: add_inline_imm_neg_16_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1693,7 +1681,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; ; VI-LABEL: add_inline_imm_neg_16_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1711,7 +1699,7 @@ define 
amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; ; SI-LABEL: add_inline_imm_neg_16_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1737,8 +1725,8 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_63_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00] @@ -1749,12 +1737,12 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_63_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) 
; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x7e,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1762,26 +1750,25 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_63_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 63 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x7e,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 63 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x7e,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_63_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt 
lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x367c0000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH003F @@ -1793,8 +1780,8 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_64_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00] @@ -1805,12 +1792,12 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_64_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: 
[0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x80,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1818,26 +1805,25 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_64_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 64 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x80,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 64 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x80,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_64_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; 
SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x36800000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH0040 diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll index 342d7b0237118d..ae51c3edf1c7e7 100644 --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -665,4 +665,4 @@ define <2 x i16> @mul_inline_imm_inv2pi_v2i16(<2 x i16> %x) { ret <2 x i16> %y } -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index b89dbd42e0466f..72f10ea892e53f 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 ; GFX8V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 @@ -33,8 +33,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0xc8 +; GFX8V5-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xc8 ; GFX8V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 @@ -56,7 +56,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 @@ -80,7 +80,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 @@ -111,8 +111,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 -; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40 +; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -123,8 +123,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_shared: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xcc -; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xcc +; GFX8V5-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -135,7 +135,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) 
{ ; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4 +; GFX9V4-NEXT: s_load_dword s2, s[4:5], 0x4 ; GFX9V4-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 @@ -147,7 +147,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, s[6:7], 0x4 +; GFX9V5-NEXT: s_load_dword s2, s[4:5], 0x4 ; GFX9V5-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 @@ -165,8 +165,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 -; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -177,8 +177,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_private: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 -; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xc8 +; GFX8V5-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -189,7 +189,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4 +; GFX9V4-NEXT: s_load_dword s2, s[4:5], 0x4 ; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 @@ -201,7 +201,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; 
GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, s[6:7], 0x4 +; GFX9V5-NEXT: s_load_dword s2, s[4:5], 0x4 ; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 @@ -219,12 +219,12 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { define amdgpu_kernel void @llvm_trap() { ; GFX8V4-LABEL: llvm_trap: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX8V4-NEXT: s_trap 2 ; ; GFX8V5-LABEL: llvm_trap: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xc8 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_trap 2 ; diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll index 7c8d89ef03b1b2..4c5c136f5333f3 100644 --- a/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes=amdgpu-attributor < %s | llc | FileCheck %s +; RUN: llc < %s | FileCheck %s target triple = "amdgcn-amd-amdhsa" diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index eb4cba35e9946e..47110d94918879 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -11,63 +11,59 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX9-LABEL: indirect_call_known_no_special_inputs: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9-NEXT: 
s_mov_b64 s[8:9], 0 -; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 ; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, wobble@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, wobble@gotpcrel32@hi+12 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, snork@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, snork@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[20:21], s[8:9], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, 1, s15 -; GFX9-NEXT: s_cmp_eq_u32 s8, 1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_cselect_b32 s17, s21, s19 -; GFX9-NEXT: s_cselect_b32 s16, s20, s18 -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-NEXT: s_and_b32 s4, 1, s7 +; GFX9-NEXT: s_cmp_eq_u32 s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_cselect_b32 s5, s13, s11 +; GFX9-NEXT: s_cselect_b32 s4, s12, s10 +; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: indirect_call_known_no_special_inputs: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_getpc_b64 s[6:7] -; GFX12-NEXT: s_sext_i32_i16 s7, s7 -; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+16 -; GFX12-NEXT: 
s_mov_b64 s[10:11], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], 0 -; GFX12-NEXT: s_getpc_b64 s[8:9] -; GFX12-NEXT: s_sext_i32_i16 s9, s9 -; GFX12-NEXT: s_add_co_u32 s8, s8, wobble@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s9, s9, wobble@gotpcrel32@hi+16 -; GFX12-NEXT: s_load_u8 s12, s[4:5], 0x0 -; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 -; GFX12-NEXT: s_load_b64 s[6:7], s[8:9], 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, snork@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, snork@gotpcrel32@hi+16 +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_add_co_u32 s4, s4, wobble@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s5, s5, wobble@gotpcrel32@hi+16 +; GFX12-NEXT: s_load_u8 s6, s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 ; GFX12-NEXT: v_mov_b32_e32 v31, v0 +; GFX12-NEXT: s_mov_b64 s[8:9], 0 ; GFX12-NEXT: s_mov_b32 s32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_and_b32 s8, 1, s12 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s8, 1 -; GFX12-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX12-NEXT: s_cselect_b32 s7, s7, s5 -; GFX12-NEXT: s_cselect_b32 s6, s6, s4 -; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX12-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-NEXT: s_and_b32 s4, 1, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s4, 1 +; GFX12-NEXT: s_cselect_b32 s1, s3, s1 +; GFX12-NEXT: s_cselect_b32 s0, s2, s0 +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX12-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index f54a511eff7f1d..8183106b0ce9d4 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ 
b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @infinite_loop(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -27,6 +27,7 @@ define amdgpu_kernel void @infinite_loop(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP]], label [[DUMMYRETURNBLOCK:%.*]] ; IR: DummyReturnBlock: ; IR-NEXT: ret void +; entry: br label %loop @@ -39,10 +40,10 @@ define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_cbranch_execz .LBB1_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -66,6 +67,7 @@ define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP]], label [[UNIFIEDRETURNBLOCK]] ; IR: UnifiedReturnBlock: ; IR-NEXT: ret void +; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond = icmp eq i32 %tmp, 1 @@ -82,7 +84,7 @@ return: define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 ; SI-NEXT: s_cbranch_scc1 .LBB2_4 ; SI-NEXT: ; %bb.1: @@ -129,6 +131,7 @@ define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP2]], label [[DUMMYRETURNBLOCK]] ; IR: DummyReturnBlock: ; IR-NEXT: ret void +; entry: br i1 undef, label %loop1, label %loop2 @@ -145,10 
+148,10 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_cbranch_execz .LBB3_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -189,6 +192,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; IR-NEXT: br i1 [[COND3]], label [[INNER_LOOP]], label [[OUTER_LOOP]] ; IR: UnifiedReturnBlock: ; IR-NEXT: ret void +; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 4d62d30a38ed34..76b007c22b699c 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,15 +8,15 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %11 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %4 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %9 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %9 + ; 
GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %4 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() @@ -27,15 +27,15 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %11 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %11 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %4 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %4 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %9 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %9 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %4 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() @@ -46,15 +46,15 @@ define amdgpu_kernel void @v_input_output_i128() { define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %11 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %11 + ; 
GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %4 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %4 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %9 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %9 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %4 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll index e7a7b8a335d0d3..4fecdb576a6de3 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll @@ -6,20 +6,17 @@ ; GCN: define amdgpu_kernel void @caller(ptr addrspace(1) nocapture %p) local_unnamed_addr #1 { ; GCN: %mul.i = fmul float %load, 1.500000e+01 -; UNSAFE: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; UNSAFE: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; UNSAFE: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "unsafe-fp-math"="true" } +; UNSAFE: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="true" } -; NOINFS: attributes #0 = { nounwind 
"amdgpu-waves-per-eu"="4,10" "no-infs-fp-math"="true" "uniform-work-group-size"="false" } -; NOINFS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="false" } +; NOINFS: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-infs-fp-math"="true" } +; NOINFS: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "unsafe-fp-math"="false" } -; NONANS: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "no-nans-fp-math"="true" "uniform-work-group-size"="false" } -; NONANS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="false" } - -declare void @extern() #0 +; NONANS: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-nans-fp-math"="true" } +; NONANS: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "unsafe-fp-math"="false" } define float @foo(float %x) #0 { entry: - call void @extern() %mul = fmul float %x, 1.500000e+01 ret float %mul } @@ -27,7 +24,7 @@ entry: define amdgpu_kernel void @caller(ptr addrspace(1) %p) #1 { entry: %load = load float, ptr addrspace(1) %p, align 4 - %call = call fast float @foo(float %load) + %call = call fast float @foo(float %load) #0 store float %call, ptr addrspace(1) %p, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll index 807a7d26f49e53..46b2eb30c791c7 100644 --- a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll @@ -10,7 +10,7 @@ entry: } ; GCN-LABEL: 
{{^}}inline_asm_input_v2f16: -; GCN: s_mov_b32 s2, s{{[0-9]+}} +; GCN: s_mov_b32 s0, s{{[0-9]+}} define amdgpu_kernel void @inline_asm_input_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { entry: %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x half> %in) #0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index b62bf890e65fe1..f736ca7cd625a3 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -4,22 +4,22 @@ define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) { ; GCN-LABEL: float4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 ; GCN-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -37,7 +37,7 @@ entry: define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: float4_inselt_undef: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-NEXT: 
v_mov_b32_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v2, v0 @@ -56,23 +56,23 @@ entry: define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i32 %sel) { ; GCN-LABEL: int4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s8, 3 -; GCN-NEXT: s_cselect_b32 s2, s7, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 -; GCN-NEXT: s_cselect_b32 s3, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cselect_b32 s3, s7, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_cselect_b32 s6, s6, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, s4, 1 ; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -85,15 +85,15 @@ entry: define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec, i32 %sel) { ; GCN-LABEL: float2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s6, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: 
s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc @@ -109,21 +109,21 @@ entry: define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec, i32 %sel) { ; GCN-LABEL: float8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GCN-NEXT: s_load_dword s12, s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 ; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_mov_b32_e32 v6, s10 ; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: s_mov_b32 m0, s12 ; GCN-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v8, s2 @@ -142,14 +142,14 @@ entry: define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %vec, i32 %sel) { ; GCN-LABEL: float16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s20, s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s20, s[0:1], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_add_u32 s2, s0, 48 -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v17, s3 +; GCN-NEXT: s_add_u32 s0, s2, 48 +; GCN-NEXT: 
s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 @@ -166,24 +166,24 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-NEXT: v_mov_b32_e32 v14, s18 ; GCN-NEXT: v_mov_b32_e32 v15, s19 ; GCN-NEXT: s_mov_b32 m0, s20 -; GCN-NEXT: v_mov_b32_e32 v16, s2 -; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: s_add_u32 s0, s2, 32 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 -; GCN-NEXT: v_mov_b32_e32 v12, s2 -; GCN-NEXT: s_add_u32 s2, s0, 16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v13, s1 +; GCN-NEXT: v_mov_b32_e32 v12, s0 +; GCN-NEXT: s_add_u32 s0, s2, 16 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s3 -; GCN-NEXT: v_mov_b32_e32 v8, s2 +; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: v_mov_b32_e32 v8, s0 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -195,18 +195,18 @@ entry: define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %vec, i32 %sel) { ; GCN-LABEL: float32_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xe4 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x124 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xe4 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x124 
; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v33, s3 +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_add_u32 s0, s2, 0x70 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v33, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NEXT: v_mov_b32_e32 v4, s40 ; GCN-NEXT: v_mov_b32_e32 v5, s41 @@ -236,48 +236,48 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-NEXT: v_mov_b32_e32 v29, s17 ; GCN-NEXT: v_mov_b32_e32 v30, s18 ; GCN-NEXT: v_mov_b32_e32 v31, s19 -; GCN-NEXT: v_mov_b32_e32 v32, s2 -; GCN-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-NEXT: v_mov_b32_e32 v32, s0 +; GCN-NEXT: s_add_u32 s0, s2, 0x60 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v29, s3 -; GCN-NEXT: v_mov_b32_e32 v28, s2 -; GCN-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v29, s1 +; GCN-NEXT: v_mov_b32_e32 v28, s0 +; GCN-NEXT: s_add_u32 s0, s2, 0x50 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v25, s3 -; GCN-NEXT: v_mov_b32_e32 v24, s2 -; GCN-NEXT: s_add_u32 s2, s0, 64 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v25, s1 +; GCN-NEXT: v_mov_b32_e32 v24, s0 +; GCN-NEXT: s_add_u32 s0, s2, 64 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v21, s3 -; GCN-NEXT: v_mov_b32_e32 v20, s2 -; GCN-NEXT: s_add_u32 s2, s0, 48 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v21, s1 +; GCN-NEXT: v_mov_b32_e32 v20, s0 +; GCN-NEXT: s_add_u32 s0, s2, 48 +; GCN-NEXT: s_addc_u32 s1, 
s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v17, s3 -; GCN-NEXT: v_mov_b32_e32 v16, s2 -; GCN-NEXT: s_add_u32 s2, s0, 32 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: s_add_u32 s0, s2, 32 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 -; GCN-NEXT: v_mov_b32_e32 v12, s2 -; GCN-NEXT: s_add_u32 s2, s0, 16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v13, s1 +; GCN-NEXT: v_mov_b32_e32 v12, s0 +; GCN-NEXT: s_add_u32 s0, s2, 16 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s3 -; GCN-NEXT: v_mov_b32_e32 v8, s2 +; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: v_mov_b32_e32 v8, s0 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -289,8 +289,8 @@ entry: define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) { ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -314,7 +314,7 @@ entry: define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, i32 %sel) { ; GCN-LABEL: half2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 ; GCN-NEXT: s_lshl_b32 s3, 
0xffff, s3 @@ -335,49 +335,49 @@ entry: define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, i32 %sel) { ; GCN-LABEL: half8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 7 -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: s_lshr_b32 s3, s7, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 7 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 6 +; GCN-NEXT: s_cmp_lg_u32 s2, 6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s6, 16 +; GCN-NEXT: s_lshr_b32 s3, s6, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 5 +; GCN-NEXT: s_cmp_lg_u32 s2, 5 ; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 4 +; GCN-NEXT: s_cmp_lg_u32 s2, 4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s5, 16 +; GCN-NEXT: s_lshr_b32 s3, s5, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; 
GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-NEXT: s_lshr_b32 s3, s4, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -397,7 +397,7 @@ entry: define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, i32 %sel) { ; GCN-LABEL: short2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 @@ -418,8 +418,8 @@ entry: define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) { ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x10001 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -443,8 +443,8 @@ entry: define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) 
; GCN-NEXT: s_lshl_b32 s4, s4, 3 ; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 @@ -467,99 +467,99 @@ entry: define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s7, 24 -; GCN-NEXT: s_cmp_lg_u32 s8, 15 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_lshr_b32 s3, s7, 24 +; GCN-NEXT: s_cmp_lg_u32 s2, 15 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 14 +; GCN-NEXT: s_lshr_b32 s3, s7, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 14 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s7, 8 +; GCN-NEXT: s_lshr_b32 s3, s7, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 13 +; GCN-NEXT: s_cmp_lg_u32 s2, 13 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 12 +; GCN-NEXT: s_cmp_lg_u32 s2, 12 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: s_lshr_b32 s2, s6, 24 +; GCN-NEXT: s_lshr_b32 s3, s6, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s8, 11 +; 
GCN-NEXT: s_cmp_lg_u32 s2, 11 ; GCN-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s6, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 10 +; GCN-NEXT: s_lshr_b32 s3, s6, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 10 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s6, 8 +; GCN-NEXT: s_lshr_b32 s3, s6, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 9 +; GCN-NEXT: s_cmp_lg_u32 s2, 9 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 8 +; GCN-NEXT: s_cmp_lg_u32 s2, 8 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: s_lshr_b32 s2, s5, 24 +; GCN-NEXT: s_lshr_b32 s3, s5, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s8, 7 +; GCN-NEXT: s_cmp_lg_u32 s2, 7 ; GCN-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s5, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 6 +; GCN-NEXT: s_lshr_b32 s3, s5, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 6 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s5, 8 +; GCN-NEXT: 
s_lshr_b32 s3, s5, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 5 +; GCN-NEXT: s_cmp_lg_u32 s2, 5 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 4 +; GCN-NEXT: s_cmp_lg_u32 s2, 4 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc -; GCN-NEXT: s_lshr_b32 s2, s4, 24 +; GCN-NEXT: s_lshr_b32 s3, s4, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s4, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_lshr_b32 s3, s4, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s4, 8 +; GCN-NEXT: s_lshr_b32 s3, s4, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -580,21 +580,21 
@@ entry: define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %vec, i32 %sel) { ; GCN-LABEL: double2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s8, 1 -; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s7 -; GCN-NEXT: s_cselect_b32 s3, 0, s6 -; GCN-NEXT: s_cmp_eq_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5 +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s7 +; GCN-NEXT: s_cselect_b32 s6, 0, s6 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s5 ; GCN-NEXT: s_cselect_b32 s4, 0, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -607,10 +607,10 @@ entry: define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) { ; GCN-LABEL: double5_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s12, s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x84 -; GCN-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 +; GCN-NEXT: s_load_dword s12, s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84 +; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s12, 4 ; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9 @@ -661,12 +661,12 @@ entry: define amdgpu_kernel void 
@double8_inselt(ptr addrspace(1) %out, <8 x double> %vec, i32 %sel) { ; GCN-LABEL: double8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s20, s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s20, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 @@ -717,17 +717,17 @@ entry: define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %vec, i32 %sel) { ; GCN-LABEL: double7_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x94 -; GCN-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x84 -; GCN-NEXT: s_load_dword s2, s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x94 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x84 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_mov_b32_e32 v6, s10 @@ -738,25 +738,25 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-NEXT: v_mov_b32_e32 v11, s15 ; GCN-NEXT: v_mov_b32_e32 v12, s16 ; GCN-NEXT: v_mov_b32_e32 v13, s17 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 
-; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: s_add_u32 s0, s2, 16 ; GCN-NEXT: v_movreld_b32_e32 v1, v16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v15, s3 -; GCN-NEXT: v_mov_b32_e32 v14, s2 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v15, s1 +; GCN-NEXT: v_mov_b32_e32 v14, s0 ; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-NEXT: s_add_u32 s2, s0, 48 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_add_u32 s0, s2, 48 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_add_u32 s0, s0, 32 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: s_add_u32 s0, s2, 32 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13] -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -770,15 +770,14 @@ entry: define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> %vec, i32 %sel) { ; GCN-LABEL: double16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x124 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xe4 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x124 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xe4 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: s_lshl_b32 s0, s0, 1 -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 @@ 
-810,7 +809,7 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-NEXT: v_mov_b32_e32 v29, s17 ; GCN-NEXT: v_mov_b32_e32 v30, s18 ; GCN-NEXT: v_mov_b32_e32 v31, s19 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s3, s1, 0 @@ -868,22 +867,20 @@ entry: define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> %vec, i32 %sel) { ; GCN-LABEL: double15_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x114 -; GCN-NEXT: s_load_dwordx4 s[20:23], s[2:3], 0x104 -; GCN-NEXT: s_load_dwordx8 s[24:31], s[2:3], 0xe4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x114 +; GCN-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x104 +; GCN-NEXT: s_load_dwordx8 s[24:31], s[0:1], 0xe4 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_load_dword s4, s[2:3], 0x124 -; GCN-NEXT: v_mov_b32_e32 v28, s0 -; GCN-NEXT: v_mov_b32_e32 v29, s1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x124 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v28, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s0, s4, 1 -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_lshl_b32 s2, s4, 1 ; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 @@ -909,8 +906,9 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v25, s21 ; GCN-NEXT: v_mov_b32_e32 v26, s22 ; GCN-NEXT: v_mov_b32_e32 v27, s23 +; GCN-NEXT: v_mov_b32_e32 v29, s3 +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-NEXT: v_movreld_b32_e32 v1, v32 ; GCN-NEXT: s_addc_u32 s3, s1, 0 @@ -964,13 +962,13 @@ entry: define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 %sel) { ; GCN-LABEL: bit4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: s_mov_b32 s15, 0xe80000 -; GCN-NEXT: s_add_u32 s12, s12, s9 -; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, 0xe80000 +; GCN-NEXT: s_add_u32 s4, s4, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_addc_u32 s5, s5, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 3 @@ -982,16 +980,16 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_and_b32_e32 v3, 3, v3 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0 -; GCN-NEXT: buffer_store_byte v4, off, s[12:15], 0 offset:3 -; GCN-NEXT: buffer_store_byte v3, off, s[12:15], 0 offset:2 -; GCN-NEXT: buffer_store_byte v2, off, s[12:15], 0 offset:1 +; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:3 +; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:2 +; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1 ; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: buffer_store_byte v1, v0, s[12:15], 0 offen -; GCN-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 -; GCN-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:1 -; GCN-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 offset:2 -; GCN-NEXT: buffer_load_ubyte v3, off, s[12:15], 0 offset:3 +; GCN-NEXT: buffer_store_byte v1, v0, s[4:7], 0 offen +; GCN-NEXT: 
buffer_load_ubyte v0, off, s[4:7], 0 +; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:1 +; GCN-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:2 +; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:3 ; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt vmcnt(2) @@ -1019,11 +1017,11 @@ entry: define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, i32 %sel) { ; GCN-LABEL: bit128_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x44 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s3, s4, 24 +; GCN-NEXT: s_lshr_b32 s1, s4, 24 ; GCN-NEXT: s_lshr_b32 s8, s4, 16 ; GCN-NEXT: s_lshr_b32 s9, s4, 17 ; GCN-NEXT: s_lshr_b32 s10, s4, 18 @@ -1059,10 +1057,10 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_lshr_b32 s41, s7, 21 ; GCN-NEXT: s_lshr_b32 s42, s7, 22 ; GCN-NEXT: s_lshr_b32 s43, s7, 23 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x77 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x77 ; GCN-NEXT: v_mov_b32_e32 v15, s43 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x76 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x76 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s42 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1070,11 +1068,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x75 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x75 ; GCN-NEXT: v_or_b32_e32 v15, v15, v18 ; GCN-NEXT: v_mov_b32_e32 v18, s41 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x74 +; GCN-NEXT: s_cmpk_lg_i32 s0, 
0x74 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s40 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1083,11 +1081,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x73 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x73 ; GCN-NEXT: v_or_b32_e32 v15, v18, v15 ; GCN-NEXT: v_mov_b32_e32 v18, s39 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x72 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x72 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s38 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1095,11 +1093,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x71 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x71 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_mov_b32_e32 v19, s37 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x70 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x70 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_mov_b32_e32 v20, s36 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1111,11 +1109,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v18, 15, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7f +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7f ; GCN-NEXT: v_or_b32_e32 v15, v18, v15 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7e +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s35 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1123,11 +1121,11 @@ define amdgpu_kernel void 
@bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7d +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7c +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s35 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1136,22 +1134,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7b +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7a +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s35 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x78 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x78 ; GCN-NEXT: v_mov_b32_e32 v13, s35 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x79 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x79 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s35 ; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc @@ -1166,11 +1164,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v19, v19, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: 
s_cmpk_lg_i32 s2, 0x6f +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6f ; GCN-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v18, 15, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6e +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 14, s7 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1178,11 +1176,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6d +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 13, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6c +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 12, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1191,11 +1189,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6b +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6a +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 10, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1203,11 +1201,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x69 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x69 ; GCN-NEXT: v_or_b32_e32 v19, 
v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 9, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x68 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x68 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 8, s7 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1219,11 +1217,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v17, v17, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x67 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x67 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x66 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x66 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s7 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1231,11 +1229,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x65 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x65 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x64 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x64 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1244,11 +1242,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x63 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x63 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: 
s_cmpk_lg_i32 s2, 0x62 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x62 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1256,11 +1254,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x61 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x61 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x60 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x60 ; GCN-NEXT: v_mov_b32_e32 v16, s7 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1273,11 +1271,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 4, v18 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 ; GCN-NEXT: v_or_b32_e32 v16, v16, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x57 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x57 ; GCN-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v17, s34 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x56 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x56 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s33 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1285,11 +1283,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x55 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x55 ; GCN-NEXT: v_or_b32_e32 v17, v17, v18 ; GCN-NEXT: v_mov_b32_e32 v18, s31 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x54 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x54 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: 
v_mov_b32_e32 v19, s30 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1298,11 +1296,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x53 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x53 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_mov_b32_e32 v18, s29 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x52 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x52 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s28 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1310,11 +1308,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x51 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x51 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_mov_b32_e32 v19, s27 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x50 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x50 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_mov_b32_e32 v20, s26 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1326,11 +1324,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 4, v17 ; GCN-NEXT: v_and_b32_e32 v18, 15, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5f +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5f ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5e +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s25 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1338,11 +1336,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 
v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5d +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5c +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s25 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1351,22 +1349,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5b +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5a +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s25 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x58 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x58 ; GCN-NEXT: v_mov_b32_e32 v3, s25 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x59 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x59 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s25 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc @@ -1380,11 +1378,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v3, v18, v3 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4f +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4f ; GCN-NEXT: v_or_b32_sdwa v17, 
v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v3, 15, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4e +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4e ; GCN-NEXT: v_lshrrev_b16_e64 v18, 14, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1392,11 +1390,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4d +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4d ; GCN-NEXT: v_or_b32_e32 v3, v3, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 13, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4c +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4c ; GCN-NEXT: v_lshrrev_b16_e64 v19, 12, s6 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1405,11 +1403,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4b +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4b ; GCN-NEXT: v_or_b32_e32 v3, v18, v3 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 11, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4a +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4a ; GCN-NEXT: v_lshrrev_b16_e64 v19, 10, s6 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1417,11 +1415,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x49 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x49 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 9, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: 
s_cmpk_lg_i32 s2, 0x48 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x48 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 8, s6 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1433,11 +1431,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 12, v3 ; GCN-NEXT: v_and_b32_sdwa v18, v18, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x47 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x47 ; GCN-NEXT: v_or_b32_e32 v18, v3, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 7, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x46 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x46 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1445,11 +1443,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x45 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x45 ; GCN-NEXT: v_or_b32_e32 v3, v3, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x44 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x44 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s6 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1458,11 +1456,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x43 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x43 ; GCN-NEXT: v_or_b32_e32 v19, v19, v3 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 3, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x42 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x42 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s6 ; GCN-NEXT: 
v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1470,11 +1468,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x41 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x41 ; GCN-NEXT: v_or_b32_e32 v3, v3, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 64 +; GCN-NEXT: s_cmp_lg_u32 s0, 64 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1487,11 +1485,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v19 ; GCN-NEXT: v_and_b32_e32 v2, 15, v2 -; GCN-NEXT: s_cmp_lg_u32 s2, 55 +; GCN-NEXT: s_cmp_lg_u32 s0, 55 ; GCN-NEXT: v_or_b32_e32 v2, v2, v15 ; GCN-NEXT: v_mov_b32_e32 v15, s24 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 54 +; GCN-NEXT: s_cmp_lg_u32 s0, 54 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v16, s23 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1499,12 +1497,12 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 53 +; GCN-NEXT: s_cmp_lg_u32 s0, 53 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v15, v15, v16 ; GCN-NEXT: v_mov_b32_e32 v16, s22 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 52 +; GCN-NEXT: s_cmp_lg_u32 s0, 52 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s21 @@ -1514,11 +1512,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 51 +; GCN-NEXT: s_cmp_lg_u32 s0, 51 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: v_mov_b32_e32 v16, s20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 50 +; GCN-NEXT: s_cmp_lg_u32 s0, 50 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s19 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1526,11 +1524,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 49 +; GCN-NEXT: s_cmp_lg_u32 s0, 49 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_mov_b32_e32 v17, s18 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 48 +; GCN-NEXT: s_cmp_lg_u32 s0, 48 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s17 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1542,11 +1540,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 63 +; GCN-NEXT: s_cmp_lg_u32 s0, 63 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 62 +; GCN-NEXT: s_cmp_lg_u32 s0, 62 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s16 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1554,11 +1552,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; 
GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 61 +; GCN-NEXT: s_cmp_lg_u32 s0, 61 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 60 +; GCN-NEXT: s_cmp_lg_u32 s0, 60 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1567,22 +1565,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 59 +; GCN-NEXT: s_cmp_lg_u32 s0, 59 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 58 +; GCN-NEXT: s_cmp_lg_u32 s0, 58 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: s_cmp_lg_u32 s2, 56 +; GCN-NEXT: s_cmp_lg_u32 s0, 56 ; GCN-NEXT: v_mov_b32_e32 v14, s16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 57 +; GCN-NEXT: s_cmp_lg_u32 s0, 57 ; GCN-NEXT: v_or_b32_e32 v17, v17, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 1, s16 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc @@ -1596,11 +1594,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 ; GCN-NEXT: v_and_b32_sdwa v14, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v14, v16, v14 -; GCN-NEXT: s_cmp_lg_u32 s2, 47 +; GCN-NEXT: s_cmp_lg_u32 s0, 47 ; GCN-NEXT: v_or_b32_sdwa v15, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v14, 15, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 46 +; GCN-NEXT: s_cmp_lg_u32 s0, 46 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 14, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1608,11 +1606,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 45 +; GCN-NEXT: s_cmp_lg_u32 s0, 45 ; GCN-NEXT: v_or_b32_e32 v14, v14, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 13, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 44 +; GCN-NEXT: s_cmp_lg_u32 s0, 44 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 12, s5 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1621,11 +1619,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 43 +; GCN-NEXT: s_cmp_lg_u32 s0, 43 ; GCN-NEXT: v_or_b32_e32 v14, v16, v14 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 11, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 42 +; GCN-NEXT: s_cmp_lg_u32 s0, 42 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 10, s5 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1633,11 +1631,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 41 +; GCN-NEXT: s_cmp_lg_u32 s0, 41 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 9, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 40 +; GCN-NEXT: s_cmp_lg_u32 s0, 40 ; GCN-NEXT: 
v_lshrrev_b16_e64 v18, 8, s5 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1649,11 +1647,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GCN-NEXT: v_and_b32_sdwa v16, v16, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s2, 39 +; GCN-NEXT: s_cmp_lg_u32 s0, 39 ; GCN-NEXT: v_or_b32_e32 v16, v14, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 7, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 38 +; GCN-NEXT: s_cmp_lg_u32 s0, 38 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1661,11 +1659,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 37 +; GCN-NEXT: s_cmp_lg_u32 s0, 37 ; GCN-NEXT: v_or_b32_e32 v14, v14, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 36 +; GCN-NEXT: s_cmp_lg_u32 s0, 36 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s5 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1674,11 +1672,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 35 +; GCN-NEXT: s_cmp_lg_u32 s0, 35 ; GCN-NEXT: v_or_b32_e32 v17, v17, v14 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 3, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 34 +; GCN-NEXT: s_cmp_lg_u32 s0, 34 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1686,11 +1684,11 
@@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmp_lg_u32 s2, 33 +; GCN-NEXT: s_cmp_lg_u32 s0, 33 ; GCN-NEXT: v_or_b32_e32 v18, v14, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 1, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 32 +; GCN-NEXT: s_cmp_lg_u32 s0, 32 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1704,11 +1702,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v1, 15, v1 ; GCN-NEXT: v_or_b32_e32 v1, v1, v17 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s2, 23 +; GCN-NEXT: s_cmp_lg_u32 s0, 23 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v15, s15 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 22 +; GCN-NEXT: s_cmp_lg_u32 s0, 22 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v16, s14 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1716,11 +1714,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 21 +; GCN-NEXT: s_cmp_lg_u32 s0, 21 ; GCN-NEXT: v_or_b32_e32 v15, v15, v16 ; GCN-NEXT: v_mov_b32_e32 v16, s13 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 20 +; GCN-NEXT: s_cmp_lg_u32 s0, 20 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s12 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1729,11 +1727,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: 
v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 19 +; GCN-NEXT: s_cmp_lg_u32 s0, 19 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: v_mov_b32_e32 v16, s11 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 18 +; GCN-NEXT: s_cmp_lg_u32 s0, 18 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s10 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1741,11 +1739,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 17 +; GCN-NEXT: s_cmp_lg_u32 s0, 17 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_mov_b32_e32 v17, s9 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 16 +; GCN-NEXT: s_cmp_lg_u32 s0, 16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s8 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1757,24 +1755,24 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 31 +; GCN-NEXT: s_cmp_lg_u32 s0, 31 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s3 +; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 30 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 30 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s1 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 29 +; GCN-NEXT: s_cmp_lg_u32 s0, 29 ; GCN-NEXT: v_or_b32_e32 
v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s3 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 28 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 28 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s1 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc @@ -1782,24 +1780,24 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v17, v19, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 27 +; GCN-NEXT: s_cmp_lg_u32 s0, 27 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s3 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 26 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 26 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s1 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: s_cmp_lg_u32 s2, 24 -; GCN-NEXT: v_mov_b32_e32 v18, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 24 +; GCN-NEXT: v_mov_b32_e32 v18, s1 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 25 +; GCN-NEXT: s_cmp_lg_u32 s0, 25 ; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s3 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s1 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc @@ -1811,11 +1809,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 ; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 15 +; GCN-NEXT: s_cmp_lg_u32 s0, 15 ; GCN-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v16, 15, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 14 +; GCN-NEXT: s_cmp_lg_u32 s0, 14 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 14, s4 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1823,11 +1821,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 13 +; GCN-NEXT: s_cmp_lg_u32 s0, 13 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 13, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 12 +; GCN-NEXT: s_cmp_lg_u32 s0, 12 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 12, s4 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1835,52 +1833,52 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 11 +; GCN-NEXT: s_cmp_lg_u32 s0, 11 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s4 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 10 +; GCN-NEXT: s_cmp_lg_u32 s0, 10 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 10, s4 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 9 +; GCN-NEXT: s_cmp_lg_u32 s0, 9 ; GCN-NEXT: v_lshrrev_b16_e64 v12, 9, s4 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 8 +; GCN-NEXT: s_cmp_lg_u32 s0, 8 ; 
GCN-NEXT: v_lshrrev_b16_e64 v11, 8, s4 ; GCN-NEXT: v_cndmask_b32_e32 v12, 1, v12, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 7 +; GCN-NEXT: s_cmp_lg_u32 s0, 7 ; GCN-NEXT: v_lshrrev_b16_e64 v10, 7, s4 ; GCN-NEXT: v_cndmask_b32_e32 v11, 1, v11, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 6 +; GCN-NEXT: s_cmp_lg_u32 s0, 6 ; GCN-NEXT: v_lshrrev_b16_e64 v9, 6, s4 ; GCN-NEXT: v_cndmask_b32_e32 v10, 1, v10, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 5 +; GCN-NEXT: s_cmp_lg_u32 s0, 5 ; GCN-NEXT: v_lshrrev_b16_e64 v8, 5, s4 ; GCN-NEXT: v_cndmask_b32_e32 v9, 1, v9, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 4 +; GCN-NEXT: s_cmp_lg_u32 s0, 4 ; GCN-NEXT: v_lshrrev_b16_e64 v7, 4, s4 ; GCN-NEXT: v_cndmask_b32_e32 v8, 1, v8, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s0, 3 ; GCN-NEXT: v_lshrrev_b16_e64 v6, 3, s4 ; GCN-NEXT: v_cndmask_b32_e32 v7, 1, v7, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_cmp_lg_u32 s0, 2 ; GCN-NEXT: v_lshrrev_b16_e64 v5, 2, s4 ; GCN-NEXT: v_cndmask_b32_e32 v6, 1, v6, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s0, 1 ; GCN-NEXT: v_lshrrev_b16_e64 v4, 1, s4 ; GCN-NEXT: v_cndmask_b32_e32 v5, 1, v5, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1919,9 +1917,9 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v11, v16, v11 ; GCN-NEXT: v_or_b32_e32 v0, v0, v7 ; GCN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NEXT: v_or_b32_sdwa v0, v0, v15 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 2a8eac8712e52a..68427e8937bb94 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 @@ -22,7 +22,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x flo ; ; VI-LABEL: insertelement_v2f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 @@ -40,7 +40,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -53,7 +53,7 @@ define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x flo ; ; VI-LABEL: insertelement_v2f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -71,7 +71,7 @@ define 
amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -84,7 +84,7 @@ define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32 ; ; VI-LABEL: insertelement_v2i32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -102,7 +102,7 @@ define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32 define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x3e7 @@ -115,7 +115,7 @@ define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32 ; ; VI-LABEL: insertelement_v2i32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x3e7 @@ -135,8 +135,8 @@ define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32 define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -150,8 +150,8 @@ define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -170,8 +170,8 @@ define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s1, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -185,8 +185,8 @@ define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s1, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -205,8 +205,8 @@ define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; 
SI-NEXT: s_mov_b32 s2, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -220,8 +220,8 @@ define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s2, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -240,8 +240,8 @@ define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -255,8 +255,8 @@ define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -275,8 +275,8 @@ define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32> %a) nounwind { ; SI-LABEL: insertelement_v4i32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_movk_i32 s0, 
0x3e7 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -290,8 +290,8 @@ define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32 ; ; VI-LABEL: insertelement_v4i32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_movk_i32 s0, 0x3e7 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -310,8 +310,8 @@ define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x float> %a) nounwind { ; SI-LABEL: insertelement_v3f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -323,8 +323,8 @@ define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x flo ; ; VI-LABEL: insertelement_v3f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -341,8 +341,8 @@ define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x flo define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x float> %a) nounwind { ; SI-LABEL: insertelement_v3f32_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 
v2, 0x40a00000 @@ -354,8 +354,8 @@ define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x flo ; ; VI-LABEL: insertelement_v3f32_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 @@ -497,8 +497,8 @@ define <12 x float> @insertelement_to_v12f32_undef() nounwind { define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -516,8 +516,8 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -540,9 +540,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x4 +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 ; 
SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -564,9 +564,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 ; ; VI-LABEL: dynamic_insertelement_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10 +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -593,9 +593,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x4 +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -621,9 +621,9 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 ; ; VI-LABEL: dynamic_insertelement_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10 +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -654,9 +654,9 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr 
addrspace(1) %out, <8 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x10 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0x10 ; SI-NEXT: v_mov_b32_e32 v8, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -677,9 +677,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x40 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: v_mov_b32_e32 v8, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -705,10 +705,10 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v9f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dword s4, s[6:7], 0x18 -; SI-NEXT: s_load_dword s5, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dword s6, s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: v_mov_b32_e32 v9, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -720,8 +720,8 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s4 
-; SI-NEXT: s_mov_b32 m0, s5 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: s_mov_b32 m0, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_movreld_b32_e32 v0, v9 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 @@ -731,10 +731,10 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 ; ; VI-LABEL: dynamic_insertelement_v9f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dword s4, s[6:7], 0x60 -; VI-NEXT: s_load_dword s5, s[6:7], 0x80 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v9, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -745,8 +745,8 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: s_mov_b32 m0, s5 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v0, v9 @@ -762,10 +762,10 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, <10 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v10f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x18 -; SI-NEXT: s_load_dword s6, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: v_mov_b32_e32 v10, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -777,9 +777,9 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: v_mov_b32_e32 v9, s5 -; SI-NEXT: s_mov_b32 m0, s6 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s7 +; SI-NEXT: s_mov_b32 m0, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_movreld_b32_e32 v0, v10 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -789,10 +789,10 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v10f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x60 -; VI-NEXT: s_load_dword s6, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v10, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -803,9 +803,9 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: v_mov_b32_e32 v9, s5 -; VI-NEXT: s_mov_b32 m0, s6 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: v_mov_b32_e32 v9, s7 +; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v0, v10 @@ -821,10 +821,10 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, <11 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v11f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; 
SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: v_mov_b32_e32 v11, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -849,8 +849,8 @@ define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v11f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v11, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -859,8 +859,8 @@ define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 @@ -883,10 +883,10 @@ define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, <12 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v12f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 +; SI-NEXT: 
s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: v_mov_b32_e32 v12, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -912,8 +912,8 @@ define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v12f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v12, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -922,8 +922,8 @@ define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 @@ -947,9 +947,9 @@ define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, <16 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x10 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -980,9 +980,9 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v16f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; VI-NEXT: 
s_load_dword s4, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1018,8 +1018,8 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1034,8 +1034,8 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1055,9 +1055,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1075,9 +1075,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr 
addrspace(1) %out, <3 ; ; VI-LABEL: dynamic_insertelement_v3i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1100,10 +1100,10 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { ; SI-LABEL: dynamic_insertelement_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dword s9, s[6:7], 0x11 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 +; SI-NEXT: s_load_dword s9, s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1124,10 +1124,10 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 ; ; VI-LABEL: dynamic_insertelement_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dword s9, s[6:7], 0x44 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 +; VI-NEXT: s_load_dword s9, s[4:5], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1153,9 +1153,9 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 x i32> 
%a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x10 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0x10 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1175,9 +1175,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x40 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1202,10 +1202,10 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v9i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dword s4, s[6:7], 0x18 -; SI-NEXT: s_load_dword s5, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dword s6, s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1217,8 +1217,8 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_mov_b32 m0, s5 +; SI-NEXT: v_mov_b32_e32 
v8, s6 +; SI-NEXT: s_mov_b32 m0, s4 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -1227,10 +1227,10 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 ; ; VI-LABEL: dynamic_insertelement_v9i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dword s4, s[6:7], 0x60 -; VI-NEXT: s_load_dword s5, s[6:7], 0x80 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1242,8 +1242,8 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: s_mov_b32 m0, s5 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -1257,10 +1257,10 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, <10 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v10i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x18 -; SI-NEXT: s_load_dword s6, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: 
s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1272,9 +1272,9 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: v_mov_b32_e32 v9, s5 -; SI-NEXT: s_mov_b32 m0, s6 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s7 +; SI-NEXT: s_mov_b32 m0, s4 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -1283,10 +1283,10 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v10i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x60 -; VI-NEXT: s_load_dword s6, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -1297,9 +1297,9 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: v_mov_b32_e32 v9, s5 -; VI-NEXT: s_mov_b32 m0, s6 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: v_mov_b32_e32 v9, s7 +; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -1314,10 +1314,10 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, <11 x i32> %a, i32 %b) nounwind { ; SI-LABEL: 
dynamic_insertelement_v11i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1341,17 +1341,17 @@ define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v11i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 @@ -1374,10 +1374,10 @@ define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, <12 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v12i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 +; SI-NEXT: 
s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1402,17 +1402,17 @@ define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v12i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 @@ -1436,9 +1436,9 @@ define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, <16 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x10 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 +; SI-NEXT: s_load_dword s6, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1458,7 +1458,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < ; SI-NEXT: v_mov_b32_e32 v13, s21 ; SI-NEXT: v_mov_b32_e32 v14, s22 ; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: s_mov_b32 m0, s6 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 @@ -1468,9 +1468,9 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v16i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; VI-NEXT: s_load_dword s6, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1490,7 +1490,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v13, s21 ; VI-NEXT: v_mov_b32_e32 v14, s22 ; VI-NEXT: v_mov_b32_e32 v15, s23 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_mov_b32 m0, s6 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 @@ -1505,7 +1505,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1522,7 +1522,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1544,8 +1544,8 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 define amdgpu_kernel void 
@dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s8, s[6:7], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1565,8 +1565,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 ; ; VI-LABEL: dynamic_insertelement_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s8, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1592,33 +1592,33 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s5, s[6:7], 0xa +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_lshl_b32 s4, 0xff, s4 -; SI-NEXT: s_andn2_b32 s5, s5, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x505 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 3 +; SI-NEXT: s_lshl_b32 s5, 0xff, s5 +; SI-NEXT: s_andn2_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s5, 0x505 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i8: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dword s4, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s5, s[6:7], 0x28 +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x28 ; VI-NEXT: v_mov_b32_e32 v0, 0xff ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: v_lshlrev_b16_e32 v0, s4, v0 +; VI-NEXT: s_lshl_b32 s5, s6, 3 +; VI-NEXT: v_lshlrev_b16_e32 v0, s5, v0 ; VI-NEXT: v_not_b32_e32 v1, v0 -; VI-NEXT: v_and_b32_e32 v1, s5, v1 +; VI-NEXT: v_and_b32_e32 v1, s4, v1 ; VI-NEXT: v_and_b32_e32 v0, 0x505, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1634,17 +1634,17 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s5, s[6:7], 0xa +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_lshl_b32 s4, 0xff, s4 -; SI-NEXT: s_andn2_b32 s5, s5, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x5050505 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 3 +; SI-NEXT: s_lshl_b32 s5, 0xff, s5 +; SI-NEXT: s_andn2_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s5, 0x5050505 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1654,17 +1654,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 ; ; VI-LABEL: dynamic_insertelement_v3i8: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dword s4, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s5, s[6:7], 0x28 +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: s_lshl_b32 s4, 0xff, s4 -; VI-NEXT: s_andn2_b32 s5, s5, s4 -; VI-NEXT: s_and_b32 s4, s4, 0x5050505 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s6, 3 +; VI-NEXT: s_lshl_b32 s5, 0xff, s5 +; VI-NEXT: s_andn2_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s5, 0x5050505 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1679,34 +1679,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s5, s[6:7], 0xa +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_lshl_b32 s4, 0xff, s4 -; SI-NEXT: s_andn2_b32 s5, s5, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x5050505 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 3 +; SI-NEXT: s_lshl_b32 s5, 0xff, s5 +; SI-NEXT: s_andn2_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s5, 0x5050505 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v4i8: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dword s4, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s5, s[6:7], 0x28 +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: s_lshl_b32 s4, 0xff, s4 -; VI-NEXT: s_andn2_b32 s5, s5, s4 -; VI-NEXT: s_and_b32 s4, s4, 0x5050505 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s6, 3 +; VI-NEXT: s_lshl_b32 s5, 0xff, s5 +; VI-NEXT: s_andn2_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s5, 0x5050505 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1718,46 +1718,46 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %a.ptr, i32 %b) nounwind { ; SI-LABEL: s_dynamic_insertelement_v8i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_lshl_b32 s0, s4, 3 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_lshl_b32 s0, s8, 3 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 -; SI-NEXT: s_and_b32 s5, s1, 0x5050505 +; SI-NEXT: s_and_b32 s9, s1, 0x5050505 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] -; SI-NEXT: s_and_b32 s4, s0, 0x5050505 -; SI-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3] +; SI-NEXT: s_and_b32 s8, s0, 0x5050505 +; SI-NEXT: s_or_b64 
s[0:1], s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_dynamic_insertelement_v8i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_mov_b32 s11, 0x1100f000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_lshl_b32 s0, s4, 3 -; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_lshl_b32 s0, s8, 3 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 -; VI-NEXT: s_and_b32 s5, s1, 0x5050505 +; VI-NEXT: s_and_b32 s9, s1, 0x5050505 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] -; VI-NEXT: s_and_b32 s4, s0, 0x5050505 -; VI-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3] +; VI-NEXT: s_and_b32 s8, s0, 0x5050505 +; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %a = load <8 x i8>, ptr addrspace(4) %a.ptr, align 4 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b @@ -1768,196 +1768,196 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <16 x i8> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x4 -; SI-NEXT: s_load_dword s4, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 +; SI-NEXT: s_load_dword 
s6, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s5, s11, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 15 +; SI-NEXT: s_lshr_b32 s4, s11, 24 +; SI-NEXT: s_cmp_lg_u32 s6, 15 +; SI-NEXT: s_cselect_b32 s4, s4, 5 +; SI-NEXT: s_lshl_b32 s4, s4, 24 +; SI-NEXT: s_lshr_b32 s5, s11, 16 +; SI-NEXT: s_cmp_lg_u32 s6, 14 ; SI-NEXT: s_cselect_b32 s5, s5, 5 -; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_lshr_b32 s6, s11, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 14 -; SI-NEXT: s_cselect_b32 s6, s6, 5 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_lshr_b32 s6, s11, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 13 -; SI-NEXT: s_cselect_b32 s6, s6, 5 -; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 12 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshr_b32 s5, s11, 8 +; SI-NEXT: s_cmp_lg_u32 s6, 13 +; SI-NEXT: s_cselect_b32 s5, s5, 5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_cmp_lg_u32 s6, 12 ; SI-NEXT: s_cselect_b32 s7, s11, 5 ; SI-NEXT: s_and_b32 s7, s7, 0xff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_lshr_b32 s6, s10, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 11 -; SI-NEXT: s_cselect_b32 s6, s6, 5 -; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshr_b32 s5, s10, 24 +; SI-NEXT: s_cmp_lg_u32 s6, 11 +; SI-NEXT: s_cselect_b32 s5, s5, 5 +; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshr_b32 s7, s10, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 10 +; SI-NEXT: s_cmp_lg_u32 s6, 10 ; SI-NEXT: s_cselect_b32 s7, s7, 5 ; SI-NEXT: s_and_b32 s7, s7, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_lshr_b32 s7, s10, 8 -; SI-NEXT: 
s_cmp_lg_u32 s4, 9 +; SI-NEXT: s_cmp_lg_u32 s6, 9 ; SI-NEXT: s_cselect_b32 s7, s7, 5 ; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 8 +; SI-NEXT: s_cmp_lg_u32 s6, 8 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_or_b32 s7, s10, s7 ; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s7, s5 ; SI-NEXT: s_lshr_b32 s7, s9, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 7 +; SI-NEXT: s_cmp_lg_u32 s6, 7 ; SI-NEXT: s_cselect_b32 s7, s7, 5 ; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_lshr_b32 s10, s9, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 6 +; SI-NEXT: s_cmp_lg_u32 s6, 6 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_or_b32 s7, s7, s10 ; SI-NEXT: s_lshr_b32 s10, s9, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 5 +; SI-NEXT: s_cmp_lg_u32 s6, 5 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 4 +; SI-NEXT: s_cmp_lg_u32 s6, 4 ; SI-NEXT: s_cselect_b32 s9, s9, 5 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_lshr_b32 s9, s8, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 3 +; SI-NEXT: s_cmp_lg_u32 s6, 3 ; SI-NEXT: s_cselect_b32 s9, s9, 5 ; SI-NEXT: s_lshl_b32 s9, s9, 24 ; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 2 +; SI-NEXT: s_cmp_lg_u32 s6, 2 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_lshr_b32 s10, s8, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cmp_lg_u32 s6, 1 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 0 -; SI-NEXT: s_cselect_b32 s4, s8, 5 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_or_b32 s4, s4, s10 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s4, s4, s9 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: 
s_cmp_lg_u32 s6, 0 +; SI-NEXT: s_cselect_b32 s6, s8, 5 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_or_b32 s6, s6, s10 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s9 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x10 -; VI-NEXT: s_load_dword s4, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s11, 24 -; VI-NEXT: s_cmp_lg_u32 s4, 15 -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: s_lshr_b32 s4, s11, 24 +; VI-NEXT: s_cmp_lg_u32 s6, 15 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s11, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 14 +; VI-NEXT: s_lshr_b32 s4, s11, 16 +; VI-NEXT: s_cmp_lg_u32 s6, 14 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s11, 8 +; VI-NEXT: s_lshr_b32 s4, s11, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cmp_lg_u32 s4, 13 +; VI-NEXT: s_cmp_lg_u32 s6, 13 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 12 +; VI-NEXT: s_cmp_lg_u32 s6, 12 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s11 ; VI-NEXT: s_cselect_b64 
vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc -; VI-NEXT: s_lshr_b32 s5, s10, 24 +; VI-NEXT: s_lshr_b32 s4, s10, 24 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cmp_lg_u32 s4, 11 +; VI-NEXT: s_cmp_lg_u32 s6, 11 ; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s10, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 10 +; VI-NEXT: s_lshr_b32 s4, s10, 16 +; VI-NEXT: s_cmp_lg_u32 s6, 10 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s10, 8 +; VI-NEXT: s_lshr_b32 s4, s10, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cmp_lg_u32 s4, 9 +; VI-NEXT: s_cmp_lg_u32 s6, 9 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 8 +; VI-NEXT: s_cmp_lg_u32 s6, 8 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc -; VI-NEXT: s_lshr_b32 s5, s9, 24 +; VI-NEXT: s_lshr_b32 s4, s9, 24 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cmp_lg_u32 s4, 7 +; VI-NEXT: s_cmp_lg_u32 s6, 7 ; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s9, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 6 +; 
VI-NEXT: s_lshr_b32 s4, s9, 16 +; VI-NEXT: s_cmp_lg_u32 s6, 6 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s9, 8 +; VI-NEXT: s_lshr_b32 s4, s9, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cmp_lg_u32 s4, 5 +; VI-NEXT: s_cmp_lg_u32 s6, 5 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 4 +; VI-NEXT: s_cmp_lg_u32 s6, 4 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: v_mov_b32_e32 v4, s9 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; VI-NEXT: s_lshr_b32 s5, s8, 24 +; VI-NEXT: s_lshr_b32 s4, s8, 24 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cmp_lg_u32 s4, 3 +; VI-NEXT: s_cmp_lg_u32 s6, 3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s8, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 2 +; VI-NEXT: s_lshr_b32 s4, s8, 16 +; VI-NEXT: s_cmp_lg_u32 s6, 2 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s8, 8 +; VI-NEXT: s_lshr_b32 s4, s8, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; VI-NEXT: s_cmp_lg_u32 s4, 1 +; VI-NEXT: s_cmp_lg_u32 s6, 1 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: 
s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v5, s8 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1977,26 +1977,26 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <1 define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-LABEL: insert_split_bb: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dword s6, s[4:5], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_cbranch_scc0 .LBB42_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s5, s[2:3], 0x1 -; SI-NEXT: s_mov_b64 s[6:7], 0 -; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; SI-NEXT: s_load_dword s7, s[2:3], 0x1 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc ; SI-NEXT: s_cbranch_vccnz .LBB42_3 ; SI-NEXT: .LBB42_2: ; %if -; SI-NEXT: s_load_dword s5, s[2:3], 0x0 +; SI-NEXT: s_load_dword s7, s[2:3], 0x0 ; SI-NEXT: .LBB42_3: ; %endif ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB42_4: @@ -2004,23 +2004,23 @@ define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: insert_split_bb: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: 
s_cmp_lg_u32 s6, 0 ; VI-NEXT: s_cbranch_scc0 .LBB42_4 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_load_dword s5, s[2:3], 0x4 +; VI-NEXT: s_load_dword s7, s[2:3], 0x4 ; VI-NEXT: s_cbranch_execnz .LBB42_3 ; VI-NEXT: .LBB42_2: ; %if ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s5, s[2:3], 0x0 +; VI-NEXT: s_load_dword s7, s[2:3], 0x0 ; VI-NEXT: .LBB42_3: ; %endif ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB42_4: @@ -2050,9 +2050,9 @@ endif: define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x18 -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x18 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2071,9 +2071,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 ; ; VI-LABEL: dynamic_insertelement_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x60 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x30 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x60 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2097,9 +2097,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 
x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2118,9 +2118,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2144,20 +2144,20 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s12, s[6:7], 0x10 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0xc +; SI-NEXT: s_load_dword s6, s[4:5], 0x10 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xc ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s12, 1 +; SI-NEXT: s_cmp_eq_u32 s6, 1 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_cselect_b32 s6, 0, s11 -; SI-NEXT: s_cselect_b32 s7, 5, s10 -; SI-NEXT: s_cmp_eq_u32 s12, 0 +; SI-NEXT: s_cselect_b32 s7, 0, s11 +; SI-NEXT: s_cselect_b32 s10, 5, s10 +; SI-NEXT: s_cmp_eq_u32 s6, 0 ; 
SI-NEXT: s_cselect_b32 s9, 0, s9 ; SI-NEXT: s_cselect_b32 s8, 5, s8 -; SI-NEXT: s_cmp_eq_u32 s12, 2 +; SI-NEXT: s_cmp_eq_u32 s6, 2 ; SI-NEXT: s_cselect_b32 s5, 0, s5 ; SI-NEXT: s_cselect_b32 s4, 5, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -2165,27 +2165,27 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v3i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s12, s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x30 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s12, 1 +; VI-NEXT: s_cmp_eq_u32 s6, 1 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_cselect_b32 s6, 0, s11 -; VI-NEXT: s_cselect_b32 s7, 5, s10 -; VI-NEXT: s_cmp_eq_u32 s12, 0 +; VI-NEXT: s_cselect_b32 s7, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 5, s10 +; VI-NEXT: s_cmp_eq_u32 s6, 0 ; VI-NEXT: s_cselect_b32 s9, 0, s9 ; VI-NEXT: s_cselect_b32 s8, 5, s8 -; VI-NEXT: s_cmp_eq_u32 s12, 2 +; VI-NEXT: s_cmp_eq_u32 s6, 2 ; VI-NEXT: s_cselect_b32 s5, 0, s5 ; VI-NEXT: s_cselect_b32 s4, 5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -2193,8 +2193,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: 
v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x i64> %a, i64 5, i32 %b @@ -2205,67 +2205,67 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 x double> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x10 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s6, s[4:5], 0x10 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s4, 1 -; SI-NEXT: s_cselect_b32 s5, 0x40200000, s11 -; SI-NEXT: s_cselect_b32 s6, 0, s10 -; SI-NEXT: s_cmp_eq_u32 s4, 0 +; SI-NEXT: s_cmp_eq_u32 s6, 1 +; SI-NEXT: s_cselect_b32 s4, 0x40200000, s11 +; SI-NEXT: s_cselect_b32 s5, 0, s10 +; SI-NEXT: s_cmp_eq_u32 s6, 0 ; SI-NEXT: s_cselect_b32 s7, 0x40200000, s9 ; SI-NEXT: s_cselect_b32 s8, 0, s8 -; SI-NEXT: s_cmp_eq_u32 s4, 3 +; SI-NEXT: s_cmp_eq_u32 s6, 3 ; SI-NEXT: s_cselect_b32 s9, 0x40200000, s15 ; SI-NEXT: s_cselect_b32 s10, 0, s14 -; SI-NEXT: s_cmp_eq_u32 s4, 2 -; SI-NEXT: s_cselect_b32 s4, 0x40200000, s13 +; SI-NEXT: s_cmp_eq_u32 s6, 2 +; SI-NEXT: s_cselect_b32 s6, 0x40200000, s13 ; SI-NEXT: s_cselect_b32 s11, 0, s12 ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_nop 0 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 
v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x40 -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x40 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s4, 1 -; VI-NEXT: s_cselect_b32 s5, 0x40200000, s11 -; VI-NEXT: s_cselect_b32 s6, 0, s10 -; VI-NEXT: s_cmp_eq_u32 s4, 0 +; VI-NEXT: s_cmp_eq_u32 s6, 1 +; VI-NEXT: s_cselect_b32 s4, 0x40200000, s11 +; VI-NEXT: s_cselect_b32 s5, 0, s10 +; VI-NEXT: s_cmp_eq_u32 s6, 0 ; VI-NEXT: s_cselect_b32 s7, 0x40200000, s9 ; VI-NEXT: s_cselect_b32 s8, 0, s8 -; VI-NEXT: s_cmp_eq_u32 s4, 3 +; VI-NEXT: s_cmp_eq_u32 s6, 3 ; VI-NEXT: s_cselect_b32 s9, 0x40200000, s15 ; VI-NEXT: s_cselect_b32 s10, 0, s14 -; VI-NEXT: s_cmp_eq_u32 s4, 2 -; VI-NEXT: s_cselect_b32 s4, 0x40200000, s13 +; VI-NEXT: s_cmp_eq_u32 s6, 2 +; VI-NEXT: s_cselect_b32 s6, 0x40200000, s13 ; VI-NEXT: s_cselect_b32 s11, 0, s12 ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x double> %a, double 8.0, i32 %b @@ -2276,13 +2276,13 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 
%b) #0 { ; SI-LABEL: dynamic_insertelement_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 -; SI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s6, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 1 +; SI-NEXT: s_lshl_b32 s4, s6, 1 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 @@ -2311,13 +2311,13 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 -; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 1 +; VI-NEXT: s_lshl_b32 s4, s6, 1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index 3135addec16183..c9b01eb5a97255 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: s_insertelement_v2bf16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 @@ -21,7 
+21,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_insertelement_v2bf16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: s_insertelement_v2bf16_0: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -48,7 +48,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: s_insertelement_v2bf16_0: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -58,6 +58,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; GFX940-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 0 store <2 x bfloat> %vecins, ptr addrspace(1) %out @@ -67,7 +68,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: s_insertelement_v2bf16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 @@ -81,7 +82,7 @@ define amdgpu_kernel void 
@s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_insertelement_v2bf16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -95,7 +96,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: s_insertelement_v2bf16_1: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -107,7 +108,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: s_insertelement_v2bf16_1: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -116,6 +117,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; GFX940-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 1 store <2 x bfloat> %vecins, ptr addrspace(1) %out @@ -125,7 +127,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -142,7 +144,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr 
addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v2bf16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -160,7 +162,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v2bf16_0: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x40a0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -173,8 +175,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v2bf16_0: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x40a0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -184,6 +185,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v1 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -197,7 +199,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_0_inlineimm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -214,7 +216,7 @@ define amdgpu_kernel void 
@v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2bf16_0_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -232,7 +234,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; ; GFX900-LABEL: v_insertelement_v2bf16_0_inlineimm: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dword v1, v0, s[2:3] @@ -244,8 +246,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; ; GFX940-LABEL: v_insertelement_v2bf16_0_inlineimm: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dword v1, v0, s[2:3] @@ -254,6 +255,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; GFX940-NEXT: v_bfi_b32 v1, s2, 53, v1 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -267,7 +269,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 
2, v0 @@ -284,7 +286,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v2bf16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -302,7 +304,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v2bf16_1: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -315,8 +317,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v2bf16_1: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -326,6 +327,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; GFX940-NEXT: v_perm_b32 v1, s2, v1, v2 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -339,7 +341,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_1_inlineimm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -356,7 +358,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2bf16_1_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -374,7 +376,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; ; GFX900-LABEL: v_insertelement_v2bf16_1_inlineimm: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -386,8 +388,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; ; GFX940-LABEL: v_insertelement_v2bf16_1_inlineimm: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -396,6 +397,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; GFX940-NEXT: v_perm_b32 v1, 35, v1, v2 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -409,8 +411,8 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 { ; SI-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; SI-NEXT: s_mov_b32 s11, 0x100f000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -432,8 +434,8 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; ; VI-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -459,11 +461,11 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; ; GFX900-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dword v1, v0, s[4:5] +; GFX900-NEXT: global_load_dword v1, v0, s[6:7] ; GFX900-NEXT: global_load_dword v2, v0, s[2:3] ; GFX900-NEXT: s_mov_b32 s2, 0xffff ; GFX900-NEXT: s_waitcnt vmcnt(1) @@ -477,14 +479,13 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; ; GFX940-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-NEXT: s_mov_b32 s0, 0xffff ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NEXT: global_load_dword v1, 
v0, s[2:3] ; GFX940-NEXT: global_load_dword v2, v0, s[6:7] -; GFX940-NEXT: s_mov_b32 s0, 0xffff ; GFX940-NEXT: s_waitcnt vmcnt(1) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX940-NEXT: v_lshlrev_b32_e64 v1, v1, s0 @@ -493,6 +494,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; GFX940-NEXT: v_bfi_b32 v1, v1, s0, v2 ; GFX940-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -508,27 +510,27 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0xc -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0xc +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s5, 0xffff -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bfi_b32 v2, s5, v4, v2 +; SI-NEXT: v_bfi_b32 v2, s4, v4, v2 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4bf16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; 
VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -546,13 +548,13 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_0: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX900-NEXT: s_mov_b32 s2, 0xffff -; GFX900-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-NEXT: v_mov_b32_e32 v3, s6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_bfi_b32 v0, s2, v3, v0 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -560,18 +562,18 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v4bf16_0: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x30 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x30 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX940-NEXT: s_mov_b32 s1, 0xffff +; GFX940-NEXT: s_mov_b32 s0, 0xffff ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, s0 +; GFX940-NEXT: v_mov_b32_e32 v3, s2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_bfi_b32 v0, s1, v3, v0 +; GFX940-NEXT: v_bfi_b32 v0, s0, v3, v0 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr 
addrspace(1) %in, i64 %tid.ext @@ -587,17 +589,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_lshl_b32 s4, s8, 16 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 @@ -606,8 +608,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v4bf16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -625,30 +627,30 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_1: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 
v2, 3, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_perm_b32 v0, s4, v0, v3 +; GFX900-NEXT: v_perm_b32 v0, s6, v0, v3 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-NEXT: s_endpgm ; ; GFX940-LABEL: v_insertelement_v4bf16_1: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_perm_b32 v0, s0, v0, v3 +; GFX940-NEXT: v_perm_b32 v0, s2, v0, v3 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -664,27 +666,27 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0xc -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0xc +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], 
s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s5, 0xffff -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bfi_b32 v3, s5, v4, v3 +; SI-NEXT: v_bfi_b32 v3, s4, v4, v3 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4bf16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -702,13 +704,13 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_2: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX900-NEXT: s_mov_b32 s2, 0xffff -; GFX900-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-NEXT: v_mov_b32_e32 v3, s6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_bfi_b32 v1, s2, v3, v1 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -716,18 +718,18 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v4bf16_2: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x30 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x30 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, 
v0 -; GFX940-NEXT: s_mov_b32 s1, 0xffff +; GFX940-NEXT: s_mov_b32 s0, 0xffff ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, s0 +; GFX940-NEXT: v_mov_b32_e32 v3, s2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_bfi_b32 v1, s1, v3, v1 +; GFX940-NEXT: v_bfi_b32 v1, s0, v3, v1 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -743,17 +745,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_lshl_b32 s4, s8, 16 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, s4, v3 @@ -762,8 +764,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v4bf16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -781,30 +783,30 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_3: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_perm_b32 v1, s4, v1, v3 +; GFX900-NEXT: v_perm_b32 v1, s6, v1, v3 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-NEXT: s_endpgm ; ; GFX940-LABEL: v_insertelement_v4bf16_3: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_perm_b32 v1, s0, v1, v3 +; GFX940-NEXT: v_perm_b32 v1, s2, v1, v3 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -820,23 +822,23 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 
%idxval) #0 { ; SI-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_lshl_b32 s6, s4, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s5, s5, 4 -; SI-NEXT: s_or_b32 s6, s4, s6 -; SI-NEXT: s_lshl_b64 s[4:5], 0xffff, s5 -; SI-NEXT: v_mov_b32_e32 v4, s6 -; SI-NEXT: v_mov_b32_e32 v5, s6 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_lshl_b32 s4, s8, 16 +; SI-NEXT: s_and_b32 s5, s8, 0xffff +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_lshl_b32 s6, s9, 4 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_lshl_b64 s[4:5], 0xffff, s6 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfi_b32 v3, s5, v4, v3 ; SI-NEXT: v_bfi_b32 v2, s4, v5, v2 @@ -845,8 +847,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; ; VI-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -871,13 +873,13 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; ; GFX900-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; GFX900: ; 
%bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX900-NEXT: s_lshl_b32 s2, s5, 4 -; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: s_lshl_b32 s2, s7, 4 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s6 ; GFX900-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 ; GFX900-NEXT: v_mov_b32_e32 v3, s4 ; GFX900-NEXT: v_mov_b32_e32 v4, s4 @@ -889,15 +891,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; ; GFX940-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX940-NEXT: s_lshl_b32 s1, s1, 4 -; GFX940-NEXT: s_pack_ll_b32_b16 s2, s0, s0 -; GFX940-NEXT: s_lshl_b64 s[0:1], 0xffff, s1 +; GFX940-NEXT: s_lshl_b32 s0, s3, 4 +; GFX940-NEXT: s_pack_ll_b32_b16 s2, s2, s2 +; GFX940-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 ; GFX940-NEXT: v_mov_b32_e32 v3, s2 ; GFX940-NEXT: v_mov_b32_e32 v4, s2 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -905,6 +906,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; GFX940-NEXT: v_bfi_b32 v0, s0, v4, v0 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -920,17 +922,17 @@ define amdgpu_kernel void 
@v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; SI-LABEL: v_insertelement_v8bf16_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v5, 0 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_lshl_b32 s4, s8, 16 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 @@ -939,8 +941,8 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v8bf16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -959,8 +961,8 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v8bf16_3: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ 
-970,27 +972,27 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; GFX900-NEXT: v_bfi_b32 v3, s2, v3, v3 ; GFX900-NEXT: v_bfi_b32 v2, s2, v2, v2 ; GFX900-NEXT: v_bfi_b32 v0, s2, v0, v0 -; GFX900-NEXT: v_perm_b32 v1, s4, v1, v5 +; GFX900-NEXT: v_perm_b32 v1, s6, v1, v5 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-NEXT: s_endpgm ; ; GFX940-LABEL: v_insertelement_v8bf16_3: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX940-NEXT: s_mov_b32 s1, 0xffff +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_bfi_b32 v3, s1, v3, v3 -; GFX940-NEXT: v_bfi_b32 v2, s1, v2, v2 -; GFX940-NEXT: v_bfi_b32 v0, s1, v0, v0 -; GFX940-NEXT: v_perm_b32 v1, s0, v1, v5 +; GFX940-NEXT: v_bfi_b32 v3, s0, v3, v3 +; GFX940-NEXT: v_bfi_b32 v2, s0, v2, v2 +; GFX940-NEXT: v_bfi_b32 v0, s0, v0, v0 +; GFX940-NEXT: v_perm_b32 v1, s2, v1, v5 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -1006,48 +1008,48 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; SI-LABEL: v_insertelement_v8bf16_dynamic: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; SI-NEXT: s_mov_b32 
s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v5, 0 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 -; SI-NEXT: s_cmp_eq_u32 s5, 6 -; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_cmp_eq_u32 s9, 6 +; SI-NEXT: v_mov_b32_e32 v6, s8 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 7 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_cmp_eq_u32 s9, 7 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 4 +; SI-NEXT: s_cmp_eq_u32 s9, 4 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 5 +; SI-NEXT: s_cmp_eq_u32 s9, 5 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 2 +; SI-NEXT: s_cmp_eq_u32 s9, 2 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 3 +; SI-NEXT: s_cmp_eq_u32 s9, 3 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v7, v3 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 0 +; SI-NEXT: s_cmp_eq_u32 s9, 0 ; SI-NEXT: v_or_b32_e32 v2, v2, v7 ; SI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: 
s_cmp_eq_u32 s5, 1 +; SI-NEXT: s_cmp_eq_u32 s9, 1 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1063,8 +1065,8 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v8bf16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1117,40 +1119,40 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; ; GFX900-LABEL: v_insertelement_v8bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX900-NEXT: s_cmp_eq_u32 s5, 6 -; GFX900-NEXT: v_mov_b32_e32 v5, s4 +; GFX900-NEXT: s_cmp_eq_u32 s7, 6 +; GFX900-NEXT: v_mov_b32_e32 v5, s6 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 7 +; GFX900-NEXT: s_cmp_eq_u32 s7, 7 ; GFX900-NEXT: s_mov_b32 s2, 0x5040100 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc ; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 4 +; GFX900-NEXT: s_cmp_eq_u32 s7, 4 ; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 5 +; GFX900-NEXT: s_cmp_eq_u32 s7, 5 ; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 2 
+; GFX900-NEXT: s_cmp_eq_u32 s7, 2 ; GFX900-NEXT: v_perm_b32 v3, v3, v6, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 3 +; GFX900-NEXT: s_cmp_eq_u32 s7, 3 ; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 0 +; GFX900-NEXT: s_cmp_eq_u32 s7, 0 ; GFX900-NEXT: v_perm_b32 v2, v6, v2, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 1 +; GFX900-NEXT: s_cmp_eq_u32 s7, 1 ; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1162,49 +1164,49 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; ; GFX940-LABEL: v_insertelement_v8bf16_dynamic: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] -; GFX940-NEXT: s_cmp_eq_u32 s1, 6 -; GFX940-NEXT: v_mov_b32_e32 v5, s0 +; GFX940-NEXT: s_cmp_eq_u32 s3, 6 +; GFX940-NEXT: v_mov_b32_e32 v5, s2 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 7 +; GFX940-NEXT: s_cmp_eq_u32 s3, 7 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc ; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 4 +; GFX940-NEXT: s_cmp_eq_u32 s3, 4 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; 
GFX940-NEXT: s_cmp_eq_u32 s1, 5 +; GFX940-NEXT: s_cmp_eq_u32 s3, 5 ; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 2 -; GFX940-NEXT: v_perm_b32 v3, v3, v6, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 2 +; GFX940-NEXT: v_perm_b32 v3, v3, v6, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 3 +; GFX940-NEXT: s_cmp_eq_u32 s3, 3 ; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 0 -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 0 +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 1 +; GFX940-NEXT: s_cmp_eq_u32 s3, 1 ; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX940-NEXT: v_perm_b32 v1, v6, v1, s2 -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v6, v1, s0 +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s0 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -1220,18 +1222,18 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; SI-LABEL: v_insertelement_v16bf16_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v9, 0 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[8:11], 0 addr64 -; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[8:11], 0 addr64 offset:16 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_lshl_b32 s4, s8, 16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 @@ -1242,8 +1244,8 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; ; VI-LABEL: v_insertelement_v16bf16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1269,15 +1271,15 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; ; GFX900-LABEL: v_insertelement_v16bf16_3: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX900-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX900-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_perm_b32 v1, s4, v1, v9 +; GFX900-NEXT: v_perm_b32 v1, s6, 
v1, v9 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1285,20 +1287,20 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; ; GFX940-LABEL: v_insertelement_v16bf16_3: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX940-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] ; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:16 ; GFX940-NEXT: s_waitcnt vmcnt(1) -; GFX940-NEXT: v_perm_b32 v1, s0, v1, v9 +; GFX940-NEXT: v_perm_b32 v1, s2, v1, v9 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -1314,21 +1316,22 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; SI-LABEL: v_insertelement_v16bf16_dynamic: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s11, 0x100f000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 +; SI-NEXT: v_mov_b32_e32 v5, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: v_mov_b32_e32 v5, 0 ; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], 
s[8:11], 0 addr64 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 offset:16 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s5, 6 ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s5, 7 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cndmask_b32_e32 v11, v10, v6, vcc ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 @@ -1414,8 +1417,8 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; VI-LABEL: v_insertelement_v16bf16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -1511,74 +1514,74 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; GFX900-LABEL: v_insertelement_v16bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 -; GFX900-NEXT: s_cmp_eq_u32 s5, 6 -; GFX900-NEXT: v_mov_b32_e32 v9, s4 +; GFX900-NEXT: s_cmp_eq_u32 s7, 6 +; GFX900-NEXT: v_mov_b32_e32 v9, s6 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 7 +; GFX900-NEXT: s_cmp_eq_u32 s7, 7 ; GFX900-NEXT: s_mov_b32 s2, 0x5040100 ; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc ; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; 
GFX900-NEXT: s_cmp_eq_u32 s5, 4 +; GFX900-NEXT: s_cmp_eq_u32 s7, 4 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 5 +; GFX900-NEXT: s_cmp_eq_u32 s7, 5 ; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 2 +; GFX900-NEXT: s_cmp_eq_u32 s7, 2 ; GFX900-NEXT: v_perm_b32 v4, v4, v10, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 3 +; GFX900-NEXT: s_cmp_eq_u32 s7, 3 ; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 0 +; GFX900-NEXT: s_cmp_eq_u32 s7, 0 ; GFX900-NEXT: v_perm_b32 v3, v10, v3, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 1 +; GFX900-NEXT: s_cmp_eq_u32 s7, 1 ; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 14 +; GFX900-NEXT: s_cmp_eq_u32 s7, 14 ; GFX900-NEXT: v_perm_b32 v2, v10, v2, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 15 +; GFX900-NEXT: s_cmp_eq_u32 s7, 15 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v8 ; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 12 +; GFX900-NEXT: s_cmp_eq_u32 s7, 12 ; GFX900-NEXT: v_perm_b32 v1, v10, v1, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 13 +; GFX900-NEXT: s_cmp_eq_u32 s7, 13 ; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; 
GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 10 +; GFX900-NEXT: s_cmp_eq_u32 s7, 10 ; GFX900-NEXT: v_perm_b32 v8, v10, v8, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 11 +; GFX900-NEXT: s_cmp_eq_u32 s7, 11 ; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 8 +; GFX900-NEXT: s_cmp_eq_u32 s7, 8 ; GFX900-NEXT: v_perm_b32 v7, v10, v7, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 9 +; GFX900-NEXT: s_cmp_eq_u32 s7, 9 ; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1591,84 +1594,84 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; GFX940-LABEL: v_insertelement_v16bf16_dynamic: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] ; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:16 -; GFX940-NEXT: s_cmp_eq_u32 s1, 6 -; GFX940-NEXT: v_mov_b32_e32 v9, s0 +; GFX940-NEXT: s_cmp_eq_u32 s3, 6 +; GFX940-NEXT: v_mov_b32_e32 v9, s2 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 7 +; GFX940-NEXT: s_cmp_eq_u32 s3, 7 ; GFX940-NEXT: s_waitcnt vmcnt(1) ; GFX940-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc ; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: 
s_cmp_eq_u32 s1, 4 +; GFX940-NEXT: s_cmp_eq_u32 s3, 4 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 5 +; GFX940-NEXT: s_cmp_eq_u32 s3, 5 ; GFX940-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 2 -; GFX940-NEXT: v_perm_b32 v3, v3, v10, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 2 +; GFX940-NEXT: v_perm_b32 v3, v3, v10, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 3 +; GFX940-NEXT: s_cmp_eq_u32 s3, 3 ; GFX940-NEXT: v_lshrrev_b32_e32 v12, 16, v1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 0 -; GFX940-NEXT: v_perm_b32 v2, v10, v2, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 0 +; GFX940-NEXT: v_perm_b32 v2, v10, v2, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 1 +; GFX940-NEXT: s_cmp_eq_u32 s3, 1 ; GFX940-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 14 -; GFX940-NEXT: v_perm_b32 v1, v10, v1, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 14 +; GFX940-NEXT: v_perm_b32 v1, v10, v1, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 15 +; GFX940-NEXT: s_cmp_eq_u32 s3, 15 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; GFX940-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 12 -; GFX940-NEXT: v_perm_b32 v0, v10, v0, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 12 +; GFX940-NEXT: v_perm_b32 v0, v10, v0, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; 
GFX940-NEXT: s_cmp_eq_u32 s1, 13 +; GFX940-NEXT: s_cmp_eq_u32 s3, 13 ; GFX940-NEXT: v_lshrrev_b32_e32 v15, 16, v6 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 10 -; GFX940-NEXT: v_perm_b32 v7, v10, v7, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 10 +; GFX940-NEXT: v_perm_b32 v7, v10, v7, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 11 +; GFX940-NEXT: s_cmp_eq_u32 s3, 11 ; GFX940-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; GFX940-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 8 -; GFX940-NEXT: v_perm_b32 v6, v10, v6, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 8 +; GFX940-NEXT: v_perm_b32 v6, v10, v6, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 9 +; GFX940-NEXT: s_cmp_eq_u32 s3, 9 ; GFX940-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX940-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX940-NEXT: v_perm_b32 v5, v10, v5, s2 -; GFX940-NEXT: v_perm_b32 v4, v9, v4, s2 +; GFX940-NEXT: v_perm_b32 v5, v10, v5, s0 +; GFX940-NEXT: v_perm_b32 v4, v9, v4, s0 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 647870f0e08979..1ba2491d2210ec 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void 
@s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -19,7 +19,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2i16_0: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -33,7 +33,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2i16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -54,21 +54,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s2, s4, s2 +; GFX9-NEXT: s_pack_lh_b32_b16 s2, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_0_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: 
s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -83,8 +83,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; ; CI-LABEL: s_insertelement_v2i16_0_reg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -100,8 +100,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; GFX11-LABEL: s_insertelement_v2i16_0_reg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -121,14 +121,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; 
GFX9-NEXT: ;;#ASMSTART @@ -138,8 +138,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; ; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -158,8 +158,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; ; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -179,8 +179,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; GFX11-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -207,21 +207,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %elt.arg) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s2, s4, s2 +; GFX9-NEXT: s_pack_hh_b32_b16 s2, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_0_reghi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -235,8 +235,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; CI-LABEL: s_insertelement_v2i16_0_reghi: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -251,8 +251,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; GFX11-LABEL: s_insertelement_v2i16_0_reghi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -274,12 +274,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword 
s6, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 +; GFX9-NEXT: s_lshr_b32 s3, s6, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -291,8 +291,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; ; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -310,8 +310,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; ; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -330,8 +330,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 @@ -359,12 +359,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 { ; GFX9-LABEL: 
s_insertelement_v2i16_0_reghi_both_multi_use_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 +; GFX9-NEXT: s_lshr_b32 s3, s6, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2 @@ -380,8 +380,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; ; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -402,8 +402,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; ; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -425,8 +425,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 @@ -462,7 +462,7 @@ define 
amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -474,7 +474,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2i16_1: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -488,7 +488,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2i16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -508,21 +508,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 { ; GFX9-LABEL: s_insertelement_v2i16_1_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; 
VI-LABEL: s_insertelement_v2i16_1_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -537,8 +537,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; ; CI-LABEL: s_insertelement_v2i16_1_reg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -554,8 +554,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; GFX11-LABEL: s_insertelement_v2i16_1_reg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -575,7 +575,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -588,7 +588,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2f16_0: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: 
s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -602,7 +602,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2f16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -623,7 +623,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -635,7 +635,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2f16_1: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -649,7 +649,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2f16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -669,7 +669,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -682,7 +682,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2i16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2i16_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -718,9 +718,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2i16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -744,21 +742,21 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %elt.arg) #0 { ; GFX9-LABEL: v_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7060302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_perm_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_perm_b32 v1, v1, s6, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0_reghi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -776,8 +774,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; CI-LABEL: v_insertelement_v2i16_0_reghi: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -795,12 +793,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_insertelement_v2i16_0_reghi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -824,7 +819,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
global_load_dword v1, v0, s[2:3] @@ -836,7 +831,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2i16_0_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -854,7 +849,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2i16_0_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -872,9 +867,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2i16_0_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -898,7 +891,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -911,7 +904,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2i16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -929,7 +922,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2i16_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -947,9 +940,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2i16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -973,7 +964,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -985,7 +976,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2i16_1_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1003,7 +994,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2i16_1_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1021,9 +1012,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2i16_1_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1046,7 +1035,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1059,7 +1048,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1077,7 +1066,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2f16_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1095,9 +1084,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2f16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 
v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1121,7 +1108,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1133,7 +1120,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2f16_0_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1151,7 +1138,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2f16_0_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1169,9 +1156,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2f16_0_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1194,7 +1179,7 @@ define amdgpu_kernel void 
@v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1207,7 +1192,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1225,7 +1210,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2f16_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1243,9 +1228,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2f16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1269,7 +1252,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1281,7 +1264,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2f16_1_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1299,7 +1282,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2f16_1_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1317,9 +1300,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2f16_1_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1343,16 +1324,16 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(4) %idx.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s4, 
s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, s6, 4 +; GFX9-NEXT: s_lshl_b32 s2, s4, 4 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 -; GFX9-NEXT: s_andn2_b32 s3, s7, s2 +; GFX9-NEXT: s_andn2_b32 s3, s5, s2 ; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1361,10 +1342,10 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: s_insertelement_v2i16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s4, s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x0 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1380,10 +1361,10 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; ; CI-LABEL: s_insertelement_v2i16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s4, s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x0 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1400,8 +1381,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; GFX11-LABEL: s_insertelement_v2i16_dynamic: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: 
s_load_b32 s2, s[2:3], 0x0 @@ -1428,13 +1409,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) #0 { ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_lshl_b32 s2, s4, 4 +; GFX9-NEXT: s_lshl_b32 s2, s6, 4 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 @@ -1443,8 +1424,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1464,8 +1445,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1485,15 +1466,13 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; 
GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, s0, 0x3e703e7, v1 @@ -1514,11 +1493,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 { ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1532,8 +1511,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1559,8 +1538,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; CI: ; 
%bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s3 @@ -1586,10 +1565,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; GFX11-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1620,13 +1597,13 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s2, v3, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1634,8 +1611,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; 
VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1653,8 +1630,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1673,12 +1650,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1702,21 +1676,21 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, s4, v0, v3 +; GFX9-NEXT: v_perm_b32 v0, s6, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; 
GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1734,8 +1708,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1754,12 +1728,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1783,13 +1754,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 
v[0:1], v2, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1797,8 +1768,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4f16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1816,8 +1787,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1836,12 +1807,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1865,21 +1833,21 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: 
v_insertelement_v4f16_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v1, s4, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, s6, v1, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1897,8 +1865,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_3: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1917,12 +1885,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1946,13 
+1911,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4i16_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1960,8 +1925,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4i16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1979,8 +1944,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4i16_2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1999,12 +1964,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4i16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2029,11 +1991,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: global_load_dword v2, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff @@ -2048,11 +2010,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -2075,11 +2037,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: flat_load_dword v4, v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: 
v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -2102,22 +2064,20 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: global_load_b32 v2, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 0xffff ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfi_b32 v1, v3, s0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_bfi_b32 v0, v2, s0, v0 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 @@ -2139,13 +2099,13 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 { ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-NEXT: s_lshl_b32 s2, s5, 4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX9-NEXT: s_lshl_b32 s2, s7, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s6 ; GFX9-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mov_b32_e32 v4, s4 @@ -2157,8 +2117,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2183,8 +2143,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2209,12 +2169,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_lshl_b32 s1, s1, 4 @@ -2242,21 +2199,21 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr 
addrspace(1) % define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v8f16_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v1, s4, v1, v5 +; GFX9-NEXT: v_perm_b32 v1, s6, v1, v5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v8f16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2275,8 +2232,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v8f16_3: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2295,12 +2252,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v8f16_3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2324,13 +2278,13 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v8i16_6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v3, s2, v5, v3 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] @@ -2338,8 +2292,8 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v8i16_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2358,8 +2312,8 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v8i16_6: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2378,12 +2332,9 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v8i16_6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2407,40 +2358,40 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v8f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX9-NEXT: s_cmp_eq_u32 s5, 6 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_cmp_eq_u32 s7, 6 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 7 +; GFX9-NEXT: s_cmp_eq_u32 s7, 7 ; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 4 +; GFX9-NEXT: s_cmp_eq_u32 s7, 4 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 5 +; GFX9-NEXT: s_cmp_eq_u32 s7, 5 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 2 +; GFX9-NEXT: s_cmp_eq_u32 s7, 2 ; GFX9-NEXT: v_perm_b32 v3, v3, v6, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc ; GFX9-NEXT: s_cselect_b64 
vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 3 +; GFX9-NEXT: s_cmp_eq_u32 s7, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 0 ; GFX9-NEXT: v_perm_b32 v2, v6, v2, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 1 +; GFX9-NEXT: s_cmp_eq_u32 s7, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2452,8 +2403,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v8f16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2506,8 +2457,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; ; CI-LABEL: v_insertelement_v8f16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2576,12 +2527,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_insertelement_v8f16_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] ; GFX11-NEXT: s_cmp_eq_u32 s1, 6 @@ -2637,15 +2585,15 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v16f16_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_perm_b32 v1, s4, v1, v9 +; GFX9-NEXT: v_perm_b32 v1, s6, v1, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -2653,8 +2601,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v16f16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2680,8 +2628,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; ; CI-LABEL: v_insertelement_v16f16_3: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s3 @@ -2707,12 +2655,9 @@ define amdgpu_kernel 
void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_insertelement_v16f16_3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] @@ -2741,14 +2686,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v16i16_6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfi_b32 v3, s2, v9, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2758,8 +2703,8 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v16i16_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2784,8 +2729,8 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; ; CI-LABEL: 
v_insertelement_v16i16_6: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2811,12 +2756,9 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_insertelement_v16i16_6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] @@ -2845,74 +2787,74 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v16f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 -; GFX9-NEXT: s_cmp_eq_u32 s5, 6 -; GFX9-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-NEXT: s_cmp_eq_u32 s7, 6 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 7 +; GFX9-NEXT: s_cmp_eq_u32 s7, 7 ; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc ; GFX9-NEXT: 
v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 4 +; GFX9-NEXT: s_cmp_eq_u32 s7, 4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 5 +; GFX9-NEXT: s_cmp_eq_u32 s7, 5 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 2 +; GFX9-NEXT: s_cmp_eq_u32 s7, 2 ; GFX9-NEXT: v_perm_b32 v4, v4, v10, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 3 +; GFX9-NEXT: s_cmp_eq_u32 s7, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 0 ; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 1 +; GFX9-NEXT: s_cmp_eq_u32 s7, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 14 +; GFX9-NEXT: s_cmp_eq_u32 s7, 14 ; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 15 +; GFX9-NEXT: s_cmp_eq_u32 s7, 15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 12 +; GFX9-NEXT: s_cmp_eq_u32 s7, 12 ; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 13 +; GFX9-NEXT: s_cmp_eq_u32 s7, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 
-; GFX9-NEXT: s_cmp_eq_u32 s5, 10 +; GFX9-NEXT: s_cmp_eq_u32 s7, 10 ; GFX9-NEXT: v_perm_b32 v8, v10, v8, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 11 +; GFX9-NEXT: s_cmp_eq_u32 s7, 11 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 8 +; GFX9-NEXT: s_cmp_eq_u32 s7, 8 ; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 9 +; GFX9-NEXT: s_cmp_eq_u32 s7, 9 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2925,8 +2867,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v16f16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -3022,8 +2964,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; CI-LABEL: v_insertelement_v16f16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3152,12 +3094,9 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_insertelement_v16f16_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index aca4730122f901..df03e893703777 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -436,7 +436,7 @@ entry: define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -466,7 +466,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX90A-LABEL: udiv_i32: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -496,7 +496,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX10-LABEL: udiv_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX10-NEXT: s_sub_i32 s5, 0, s3 @@ -526,7 +526,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-FLATSCR-LABEL: udiv_i32: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -556,7 +556,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX11-LABEL: udiv_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX11-NEXT: s_sub_i32 s5, 0, s3 @@ -593,7 +593,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX12-LABEL: udiv_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cvt_f32_u32 s4, s3 ; GFX12-NEXT: s_sub_co_i32 s5, 0, s3 @@ -692,19 +692,19 @@ main_body: define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-LABEL: atomic_add_local: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_mul_i32 s0, s0, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-NEXT: s_mul_i32 s1, s1, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_add_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB5_2: @@ -712,19 +712,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX90A-LABEL: atomic_add_local: ; GFX90A: ; %bb.0: -; 
GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB5_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: s_mul_i32 s0, s0, 5 -; GFX90A-NEXT: v_mov_b32_e32 v1, s0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: s_mul_i32 s1, s1, 5 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: ds_add_u32 v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB5_2: @@ -732,18 +732,18 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX10-LABEL: atomic_add_local: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, exec_lo -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX10-NEXT: s_mov_b32 s2, exec_lo +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB5_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dword s1, s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX10-NEXT: s_mul_i32 s0, s0, 5 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: s_bcnt1_i32_b32 s1, s2 +; GFX10-NEXT: s_mul_i32 s1, s1, 5 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: ds_add_u32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv @@ -752,19 +752,19 @@ define amdgpu_kernel void 
@atomic_add_local(ptr addrspace(3) %local) { ; ; GFX9-FLATSCR-LABEL: atomic_add_local: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], exec -; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec +; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s0, 5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-FLATSCR-NEXT: s_mul_i32 s1, s1, 5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-FLATSCR-NEXT: ds_add_u32 v0, v1 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB5_2: @@ -772,19 +772,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX11-LABEL: atomic_add_local: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB5_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX11-NEXT: s_bcnt1_i32_b32 s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s0, s0, 5 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 +; GFX11-NEXT: s_mul_i32 s1, s1, 5 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: ds_add_u32 v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -793,19 +793,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX12-LABEL: atomic_add_local: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB5_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX12-NEXT: s_bcnt1_i32_b32 s1, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s0, s0, 5 -; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 +; GFX12-NEXT: s_mul_i32 s1, s1, 5 +; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-NEXT: ds_add_u32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -894,10 +894,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -906,8 
+906,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -923,10 +923,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB7_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX90A-NEXT: s_mul_i32 s4, s4, 5 @@ -935,8 +935,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB7_2: -; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -947,26 +947,26 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: atomic_add_ret_local: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s1, exec_lo +; GFX10-NEXT: s_mov_b32 s3, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: 
s_and_saveexec_b32 s0, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB7_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX10-NEXT: s_mul_i32 s1, s1, 5 -; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: s_mul_i32 s3, s3, 5 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: .LBB7_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -982,10 +982,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-FLATSCR-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 @@ -994,8 +994,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB7_2: -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: 
s_or_b64 exec, exec, s[2:3] +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 @@ -1006,26 +1006,26 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: atomic_add_ret_local: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB7_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s1, s1, 5 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: s_mul_i32 s3, s3, 5 +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB7_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -1037,26 +1037,26 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: atomic_add_ret_local: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12-NEXT: 
s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB7_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s1, s1, 5 -; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: s_mul_i32 s3, s3, 5 +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB7_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 @@ -1083,10 +1083,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -1094,8 +1094,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB8_2: -; GFX9-NEXT: 
s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1111,10 +1111,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB8_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX90A-NEXT: s_mul_i32 s4, s4, 5 @@ -1122,8 +1122,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB8_2: -; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -1134,24 +1134,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: add_i32_constant: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, exec_lo +; GFX10-NEXT: s_mov_b32 s3, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB8_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX10-NEXT: s_mul_i32 s1, s1, 5 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: s_mul_i32 s3, s3, 5 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB8_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1167,10 +1167,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 @@ -1178,8 +1178,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB8_2: -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; 
GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 @@ -1190,25 +1190,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: add_i32_constant: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s1, s1, 5 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: s_mul_i32 s3, s3, 5 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB8_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -1220,25 +1220,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: add_i32_constant: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; 
GFX12-NEXT: s_cbranch_execz .LBB8_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s1, s1, 5 -; GFX12-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-NEXT: s_mul_i32 s3, s3, 5 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: .LBB8_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll index b49931379b84a5..6c8646968b6762 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -59,10 +59,10 @@ define void @func_regular_call() #1 { ; GCN-LABEL: {{^}}func_tail_call: ; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, -; GCN-NEXT: s_addc_u32 s17, -; GCN-NEXT: s_setpc_b64 s[16:17] +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, +; GCN-NEXT: s_addc_u32 s5, +; GCN-NEXT: s_setpc_b64 s[4:5] ; GCN: ; NumSgprs: 32 ; GCN: ; NumVgprs: 8 diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll index 496a1c652da251..2370ceff89bd57 100644 --- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll +++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll @@ -7,11 +7,11 @@ declare void @llvm.trap() #0 ; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_kernarg_size 8 -; DOORBELL-NEXT: .amdhsa_user_sgpr_count 12 +; 
DOORBELL-NEXT: .amdhsa_user_sgpr_count 6 ; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 ; DOORBELL: .end_amdhsa_kernel -define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) #0 { +define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { store volatile i32 1, ptr addrspace(1) %arg0 call void @llvm.trap() unreachable @@ -19,7 +19,5 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) #0 { ret void } -attributes #0 = { "amdgpu-no-implicitarg-ptr" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index f9073be7e260b8..69f181fcede30f 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -8,11 +8,11 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind { ; SI-LABEL: i8_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_and_b32 s4, s2, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -20,10 +20,10 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; VI-LABEL: i8_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xff +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -32,8 +32,8 @@ define amdgpu_kernel void @i8_arg(ptr 
addrspace(1) nocapture %out, i8 %in) nounw ; ; GFX9-LABEL: i8_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -80,11 +80,11 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind { ; SI-LABEL: i8_zext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_and_b32 s4, s2, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -92,10 +92,10 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; VI-LABEL: i8_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xff +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -104,8 +104,8 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; GFX9-LABEL: i8_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -155,11 
+155,11 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind { ; SI-LABEL: i8_sext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i8 s4, s4 +; SI-NEXT: s_sext_i32_i8 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -167,10 +167,10 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; VI-LABEL: i8_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i8 s2, s4 +; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -179,8 +179,8 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; GFX9-LABEL: i8_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 @@ -230,11 +230,11 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind { ; SI-LABEL: i16_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 
; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s2, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -242,10 +242,10 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; VI-LABEL: i16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xffff +; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -254,8 +254,8 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; GFX9-LABEL: i16_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -302,11 +302,11 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind { ; SI-LABEL: i16_zext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s2, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -314,10 +314,10 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; VI-LABEL: 
i16_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xffff +; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -326,8 +326,8 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; GFX9-LABEL: i16_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -377,11 +377,11 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind { ; SI-LABEL: i16_sext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i16 s4, s4 +; SI-NEXT: s_sext_i32_i16 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -389,10 +389,10 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; VI-LABEL: i16_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s2, s4 +; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 
; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -401,8 +401,8 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; GFX9-LABEL: i16_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 @@ -452,8 +452,8 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind { ; SI-LABEL: i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -463,19 +463,19 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou ; ; VI-LABEL: i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -511,8 +511,8 @@ entry: define amdgpu_kernel void @f32_arg(ptr 
addrspace(1) nocapture %out, float %in) nounwind { ; SI-LABEL: f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -522,19 +522,19 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n ; ; VI-LABEL: f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -570,8 +570,8 @@ entry: define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; SI-LABEL: v2i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -581,19 +581,19 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; ; VI-LABEL: v2i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v2i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -659,8 +659,8 @@ entry: define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; SI-LABEL: v2i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -670,19 +670,19 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; ; VI-LABEL: v2i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v2i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -718,7 +718,7 @@ entry: define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind { ; SI-LABEL: v2i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -731,7 +731,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; VI-LABEL: v2i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -742,7 +742,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; GFX9-LABEL: v2i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -781,7 +781,7 @@ entry: define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind { ; SI-LABEL: v2f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -794,7 +794,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; VI-LABEL: v2f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -805,7 +805,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; GFX9-LABEL: v2f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -844,8 +844,8 @@ entry: define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { ; SI-LABEL: v3i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s4, 16 @@ -858,26 +858,26 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i ; ; VI-LABEL: v3i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 +; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 2 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_byte v[2:3], v5 ; VI-NEXT: flat_store_short v[0:1], v4 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -983,7 +983,7 @@ entry: define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind { ; SI-LABEL: v3i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -998,7 +998,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; VI-LABEL: v3i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 @@ -1014,7 +1014,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; GFX9-LABEL: v3i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1102,8 +1102,8 @@ entry: define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind { ; SI-LABEL: v3i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1117,8 +1117,8 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; VI-LABEL: v3i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -1130,14 +1130,14 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; GFX9-LABEL: v3i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v3i32_arg: @@ -1181,8 +1181,8 @@ entry: define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind { ; SI-LABEL: v3f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,8 +1196,8 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; VI-LABEL: v3f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -1209,14 +1209,14 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; GFX9-LABEL: v3f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v3f32_arg: @@ 
-1260,8 +1260,8 @@ entry: define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; SI-LABEL: v4i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1271,19 +1271,19 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; ; VI-LABEL: v4i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v4i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1319,7 +1319,7 @@ entry: define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; SI-LABEL: v4i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1332,7 +1332,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; VI-LABEL: v4i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -1343,7 +1343,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; GFX9-LABEL: v4i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1382,8 +1382,8 @@ entry: define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind { ; SI-LABEL: v4i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1396,8 +1396,8 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; VI-LABEL: v4i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1410,15 +1410,15 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; GFX9-LABEL: v4i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: 
v4i32_arg: @@ -1456,8 +1456,8 @@ entry: define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind { ; SI-LABEL: v4f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1470,8 +1470,8 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; VI-LABEL: v4f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1484,15 +1484,15 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; GFX9-LABEL: v4f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v4f32_arg: @@ -1530,7 +1530,7 @@ entry: define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind { ; SI-LABEL: v5i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1545,7 +1545,7 @@ 
define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; ; VI-LABEL: v5i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; ; GFX9-LABEL: v5i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1671,50 +1671,50 @@ entry: define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind { ; SI-LABEL: v5i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s2, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s5, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: 
s_load_dword s5, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 8 +; VI-NEXT: s_add_u32 s4, s2, 8 ; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: flat_store_short v[2:3], v4 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v5i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_short v2, v3, s[4:5] offset:8 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_short v2, v3, s[6:7] offset:8 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i16_arg: @@ -1902,27 +1902,27 @@ entry: define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> %in) nounwind { ; SI-LABEL: v5i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s8, s[0:1], 0x15 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; 
SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s7, s[2:3], 0x54 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 16 ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -1941,9 +1941,9 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; ; GFX9-LABEL: v5i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s8, s[6:7], 0x30 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, s8 @@ -1951,8 +1951,8 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dword v4, v5, s[4:5] offset:16 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dword v4, v5, s[6:7] offset:16 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; 
EG-LABEL: v5i32_arg: @@ -2000,27 +2000,27 @@ entry: define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float> %in) nounwind { ; SI-LABEL: v5f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s8, s[0:1], 0x15 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s7, s[2:3], 0x54 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 16 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -2039,19 +2039,19 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float ; ; GFX9-LABEL: v5f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x30 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: global_store_dword v4, v0, s[4:5] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: global_store_dword v4, v0, s[6:7] offset:16 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5f32_arg: @@ -2099,34 +2099,34 @@ entry: define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> %in) nounwind { ; SI-LABEL: v5i64_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x21 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 +; SI-NEXT: v_mov_b32_e32 v0, s0 
+; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x84 -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -2155,9 +2155,9 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> ; ; GFX9-LABEL: v5i64_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x60 -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 @@ -2241,34 +2241,34 @@ entry: define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind { ; SI-LABEL: v5f64_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x21 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 
offset:16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5f64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x84 -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -2297,9 +2297,9 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; ; GFX9-LABEL: v5f64_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x60 -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 @@ -2384,7 +2384,7 @@ entry: define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; SI-LABEL: v8i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2397,7 +2397,7 @@ define amdgpu_kernel void @v8i8_arg(ptr 
addrspace(1) %out, <8 x i8> %in) { ; ; VI-LABEL: v8i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -2408,7 +2408,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; GFX9-LABEL: v8i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2635,8 +2635,8 @@ entry: define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; SI-LABEL: v8i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2649,8 +2649,8 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; VI-LABEL: v8i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2663,15 +2663,15 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; GFX9-LABEL: v8i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: 
v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v8i16_arg: @@ -2883,8 +2883,8 @@ entry: define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind { ; SI-LABEL: v8i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2903,8 +2903,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; VI-LABEL: v8i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -2926,8 +2926,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; GFX9-LABEL: v8i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -2994,8 +2994,8 @@ entry: define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind { ; SI-LABEL: v8f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; 
SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3014,8 +3014,8 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; VI-LABEL: v8f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -3037,8 +3037,8 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; GFX9-LABEL: v8f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -3106,8 +3106,8 @@ entry: define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; SI-LABEL: v16i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3120,8 +3120,8 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; VI-LABEL: v16i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3134,15 +3134,15 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; GFX9-LABEL: v16i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 
0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v16i8_arg: @@ -3556,8 +3556,8 @@ entry: define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; SI-LABEL: v16i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3576,8 +3576,8 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; VI-LABEL: v16i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -3599,8 +3599,8 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; GFX9-LABEL: v16i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -4012,8 +4012,8 @@ entry: define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind { ; SI-LABEL: v16i32_arg: 
; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4044,8 +4044,8 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; ; VI-LABEL: v16i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_add_u32 s2, s0, 48 @@ -4085,8 +4085,8 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; ; GFX9-LABEL: v16i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 @@ -4200,8 +4200,8 @@ entry: define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind { ; SI-LABEL: v16f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4232,8 +4232,8 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; VI-LABEL: v16f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_add_u32 s2, s0, 48 @@ -4273,8 +4273,8 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; GFX9-LABEL: v16f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 @@ -4388,7 +4388,7 @@ entry: define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind { ; SI-LABEL: kernel_arg_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4401,7 +4401,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; VI-LABEL: kernel_arg_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -4412,7 +4412,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; GFX9-LABEL: kernel_arg_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4450,7 +4450,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; SI-LABEL: f64_kernel_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: 
s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4463,7 +4463,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; VI-LABEL: f64_kernel_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -4474,7 +4474,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; GFX9-LABEL: f64_kernel_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4522,8 +4522,8 @@ entry: define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind { ; SI-LABEL: i65_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s8, s4, 1 @@ -4539,8 +4539,8 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; ; VI-LABEL: i65_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -4558,11 +4558,11 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; ; GFX9-LABEL: i65_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_and_b32 s4, s6, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -4640,11 +4640,11 @@ entry: define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 1 +; SI-NEXT: s_and_b32 s4, s2, 1 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -4652,10 +4652,10 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; VI-LABEL: i1_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4664,8 +4664,8 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; GFX9-LABEL: i1_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4731,11 +4731,11 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_zext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; 
SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 1 +; SI-NEXT: s_and_b32 s4, s2, 1 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4743,10 +4743,10 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_zext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4755,8 +4755,8 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_zext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4803,8 +4803,8 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_zext_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4816,11 +4816,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_zext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -4829,8 +4829,8 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_zext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4879,11 +4879,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_sext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s4, s4, 0x10000 +; SI-NEXT: s_bfe_i32 s4, s2, 0x10000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4891,10 +4891,10 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_sext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s2, s4, 0x10000 +; VI-NEXT: s_bfe_i32 s2, s2, 0x10000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4903,8 +4903,8 
@@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_sext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10000 @@ -4953,11 +4953,11 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_sext_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -4966,21 +4966,21 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_sext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i1_arg_sext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], 
s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 @@ -5062,10 +5062,10 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { ; SI-LABEL: struct_argument_alignment: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb -; SI-NEXT: s_load_dword s9, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x11 +; SI-NEXT: s_load_dword s8, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dword s9, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -5089,46 +5089,46 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; ; VI-LABEL: struct_argument_alignment: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s5, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x44 +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s5, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: 
v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: struct_argument_alignment: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX9-NEXT: s_load_dword s5, s[6:7], 0x18 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x20 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x20 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -5196,7 +5196,6 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { ; SI-LABEL: packed_struct_argument_alignment: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_load_dword s6, s[0:1], 0x9 @@ -5230,37 +5229,37 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; ; VI-LABEL: packed_struct_argument_alignment: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s2, 49 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: s_add_u32 s4, s2, 50 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_add_u32 s0, s0, 3 -; VI-NEXT: s_addc_u32 s1, 
s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: s_add_u32 s0, s2, 51 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s2, s0, 49 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s4, s0, 50 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_add_u32 s2, s2, 3 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 51 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v9, v[2:3] ; VI-NEXT: flat_load_ubyte v10, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: s_add_u32 s0, s2, 53 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_add_u32 s2, s0, 53 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x28 +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x28 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dword v[2:3], v7 @@ -5281,10 +5280,10 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; GFX9-LABEL: packed_struct_argument_alignment: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_dword v6, v2, s[6:7] offset:13 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] offset:17 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x4 +; GFX9-NEXT: global_load_dword v6, v2, s[4:5] offset:13 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:17 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5380,11 +5379,11 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { ; SI-LABEL: struct_argument_alignment_after: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s12, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb -; SI-NEXT: s_load_dword s13, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x15 +; SI-NEXT: s_load_dword s12, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; SI-NEXT: s_load_dword s13, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -5414,11 +5413,11 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, ; ; VI-LABEL: struct_argument_alignment_after: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; VI-NEXT: s_load_dword s9, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x54 +; VI-NEXT: s_load_dword s8, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dword s9, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5446,19 +5445,19 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, 
i64} %arg0, i8, ; ; GFX9-LABEL: struct_argument_alignment_after: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 -; GFX9-NEXT: s_load_dword s11, s[6:7], 0x18 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x30 +; GFX9-NEXT: s_load_dword s10, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s11, s[4:5], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 @@ -5546,7 +5545,7 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; SI-LABEL: array_3xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5566,7 +5565,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; ; VI-LABEL: array_3xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5584,7 +5583,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; ; GFX9-LABEL: array_3xi32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -5660,8 +5659,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; SI-LABEL: array_3xi16: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:42 @@ -5681,22 +5679,22 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; ; VI-LABEL: array_3xi16: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s2, 38 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: s_add_u32 s4, s0, 2 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_add_u32 s0, s2, 42 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s2, s0, 38 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s0, 42 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_load_ushort v4, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5713,10 +5711,10 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; GFX9-LABEL: array_3xi16: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] offset:6 -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:4 -; GFX9-NEXT: 
global_load_ushort v3, v0, s[6:7] offset:2 -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:6 +; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] offset:4 +; GFX9-NEXT: global_load_ushort v3, v0, s[4:5] offset:2 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -5831,7 +5829,6 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; SI-LABEL: small_array_round_down_offset: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:37 @@ -5842,8 +5839,8 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; ; VI-LABEL: small_array_round_down_offset: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s2, 37 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s0, 37 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_ubyte v0, v[0:1] @@ -5855,7 +5852,7 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; GFX9-LABEL: small_array_round_down_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5] offset:1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5889,8 +5886,8 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { ; SI-LABEL: byref_align_constant_i32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x49 -; SI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x49 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5904,13 +5901,13 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; VI-LABEL: byref_align_constant_i32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x124 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x124 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 @@ -5919,8 +5916,8 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; GFX9-LABEL: byref_align_constant_i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x100 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -5973,83 +5970,83 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { ; SI-LABEL: byref_natural_align_constant_v16i32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s20, s[2:3], 0x29 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: 
s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x29 +; SI-NEXT: s_mov_b32 s23, 0xf000 +; SI-NEXT: s_mov_b32 s22, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_mov_b32_e32 v3, s15 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: byref_natural_align_constant_v16i32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s20, s[2:3], 0xa4 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s20, s[0:1], 0xa4 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_add_u32 s2, s0, 48 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: s_add_u32 s0, s2, 48 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s2, 32 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s20 @@ -6059,9 +6056,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; ; GFX9-LABEL: byref_natural_align_constant_v16i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx16 s[8:23], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x80 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x80 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll index f74f9a8f2bdd82..1a73df341108fe 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -10,28 +10,28 @@ ; GCN: s_and_b32 ; HSA-VI: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) #0 { +define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { store i1 %x, ptr addrspace(1) %out, align 1 ret void } ; FUNC-LABEL: {{^}}v3i8_arg: -; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x8 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 ; HSA-VI: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) #0 { +define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { entry: store <3 x i8> %in, ptr addrspace(1) %out, align 4 ret void } ; FUNC-LABEL: {{^}}i65_arg: -; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 ; HSA-VI: .amdhsa_kernarg_size 24 -define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) #0 { +define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind { entry: store i65 %in, ptr addrspace(1) %out, align 4 ret void @@ -39,7 +39,7 @@ entry: ; FUNC-LABEL: {{^}}empty_struct_arg: ; HSA-VI: .amdhsa_kernarg_size 
0 -define amdgpu_kernel void @empty_struct_arg({} %in) #0 { +define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { ret void } @@ -54,13 +54,13 @@ define amdgpu_kernel void @empty_struct_arg({} %in) #0 { ; FIXME: Total argument size is computed wrong ; FUNC-LABEL: {{^}}struct_argument_alignment: -; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x8 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x18 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x20 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 ; HSA-VI: .amdhsa_kernarg_size 40 -define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) #0 { +define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { %val0 = extractvalue {i32, i64} %arg0, 0 %val1 = extractvalue {i32, i64} %arg0, 1 %val2 = extractvalue {i32, i64} %arg1, 0 @@ -78,11 +78,11 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; HSA-VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; HSA-VI: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13 ; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x4 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 ; HSA-VI: .amdhsa_kernarg_size 28 -define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) #0 { +define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { %val0 = extractvalue <{i32, i64}> %arg0, 0 %val1 = extractvalue <{i32, i64}> %arg0, 1 
%val2 = extractvalue <{i32, i64}> %arg1, 0 @@ -95,14 +95,14 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, } ; GCN-LABEL: {{^}}struct_argument_alignment_after: -; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x8 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x18 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x20 -; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x30 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30 ; HSA-VI: .amdhsa_kernarg_size 64 -define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) #0 { +define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { %val0 = extractvalue {i32, i64} %arg0, 0 %val1 = extractvalue {i32, i64} %arg0, 1 %val2 = extractvalue {i32, i64} %arg2, 0 @@ -116,7 +116,7 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, } ; GCN-LABEL: {{^}}array_3xi32: -; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { store volatile i16 %arg0, ptr addrspace(1) undef store volatile [3 x i32] %arg1, ptr addrspace(1) undef @@ -124,7 +124,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { } ; GCN-LABEL: {{^}}array_3xi16: -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { store volatile i8 %arg0, ptr addrspace(1) undef store volatile [3 x i16] 
%arg1, ptr addrspace(1) undef @@ -135,7 +135,7 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; GCN: s_load_dword [[DWORD:s[0-9]+]] ; GCN-DAG: s_bfe_u32 [[BFE:s[0-9]+]], [[DWORD]], 0x100010{{$}} ; GCN-DAG: s_and_b32 [[AND:s[0-9]+]], [[DWORD]], 0x7fff{{$}} -define amdgpu_kernel void @v2i15_arg(ptr addrspace(1) nocapture %out, <2 x i15> %in) #0 { +define amdgpu_kernel void @v2i15_arg(ptr addrspace(1) nocapture %out, <2 x i15> %in) { entry: store <2 x i15> %in, ptr addrspace(1) %out, align 4 ret void @@ -147,7 +147,7 @@ entry: ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: s_or_b32 -define amdgpu_kernel void @v3i15_arg(ptr addrspace(1) nocapture %out, <3 x i15> %in) #0 { +define amdgpu_kernel void @v3i15_arg(ptr addrspace(1) nocapture %out, <3 x i15> %in) { entry: store <3 x i15> %in, ptr addrspace(1) %out, align 4 ret void @@ -156,9 +156,9 @@ entry: ; Byref pointers should only be treated as offsets from kernarg ; GCN-LABEL: {{^}}byref_constant_i8_arg: ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[8:9] offset:8 +; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8 ; GCN: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) #0 { +define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) { %in = load i8, ptr addrspace(4) %in.byref %ext = zext i8 %in to i32 store i32 %ext, ptr addrspace(1) %out, align 4 @@ -167,9 +167,9 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out ; GCN-LABEL: {{^}}byref_constant_i16_arg: ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[8:9] offset:8 +; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8 ; GCN: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr 
addrspace(4) byref(i16) %in.byref) #0 { +define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) %in.byref) { %in = load i16, ptr addrspace(4) %in.byref %ext = zext i16 %in to i32 store i32 %ext, ptr addrspace(1) %out, align 4 @@ -177,9 +177,9 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou } ; GCN-LABEL: {{^}}byref_constant_i32_arg: -; GCN: s_load_dwordx4 [[LOAD:s\[[0-9]+:[0-9]+\]]], s[8:9], 0x0{{$}} +; GCN: s_load_dwordx4 [[LOAD:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}} ; GCN: .amdhsa_kernarg_size 16 -define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in.byref, i32 %after.offset) #0 { +define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in.byref, i32 %after.offset) { %in = load i32, ptr addrspace(4) %in.byref store volatile i32 %in, ptr addrspace(1) %out, align 4 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 @@ -187,10 +187,10 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou } ; GCN-LABEL: {{^}}byref_constant_v4i32_arg: -; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x10{{$}} -; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x20{{$}} +; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10{{$}} +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x20{{$}} ; GCN: .amdhsa_kernarg_size 36 -define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) %in.byref, i32 %after.offset) #0 { +define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) %in.byref, i32 %after.offset) { %in = load <4 x i32>, ptr addrspace(4) %in.byref store volatile <4 x i32> %in, ptr addrspace(1) %out, align 4 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 @@ -198,13 +198,13 @@ define amdgpu_kernel void 
@byref_constant_v4i32_arg(ptr addrspace(1) nocapture % } ; GCN-LABEL: {{^}}byref_align_constant_i32_arg: -; GCN-DAG: s_load_dwordx2 s[[[IN:[0-9]+]]:[[AFTER_OFFSET:[0-9]+]]], s[8:9], 0x100{{$}} +; GCN-DAG: s_load_dwordx2 s[[[IN:[0-9]+]]:[[AFTER_OFFSET:[0-9]+]]], s[4:5], 0x100{{$}} ; GCN-DAG: v_mov_b32_e32 [[V_IN:v[0-9]+]], s[[IN]] ; GCN-DAG: v_mov_b32_e32 [[V_AFTER_OFFSET:v[0-9]+]], s[[AFTER_OFFSET]] ; GCN: global_store_dword v{{[0-9]+}}, [[V_IN]], s ; GCN: global_store_dword v{{[0-9]+}}, [[V_AFTER_OFFSET]], s ; GCN: .amdhsa_kernarg_size 264 -define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { +define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { %in = load i32, ptr addrspace(4) %in.byref store volatile i32 %in, ptr addrspace(1) %out, align 4 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 @@ -212,10 +212,10 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu } ; GCN-LABEL: {{^}}byref_natural_align_constant_v16i32_arg: -; GCN-DAG: s_load_dword s{{[0-9]+}}, s[8:9], 0x80 -; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x40{{$}} +; GCN-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x80 +; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}} ; GCN: .amdhsa_kernarg_size 132 -define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) #0 { +define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { %in = load <16 x i32>, ptr addrspace(4) %in.byref store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4 store volatile i32 %after.offset, ptr 
addrspace(1) %out, align 4 @@ -224,9 +224,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; Also accept byref kernel arguments with other global address spaces. ; GCN-LABEL: {{^}}byref_global_i32_arg: -; GCN: s_load_dword [[IN:s[0-9]+]], s[8:9], 0x8{{$}} +; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}} ; GCN: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) %in.byref) #0 { +define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) %in.byref) { %in = load i32, ptr addrspace(1) %in.byref store i32 %in, ptr addrspace(1) %out, align 4 ret void @@ -234,17 +234,17 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ; GCN-LABEL: {{^}}byref_flat_i32_arg: ; GCN: flat_load_dword [[IN:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} offset:8{{$}} -define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) %in.byref) #0 { +define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) %in.byref) { %in = load i32, ptr %in.byref store i32 %in, ptr addrspace(1) %out, align 4 ret void } ; GCN-LABEL: {{^}}byref_constant_32bit_i32_arg: -; GCN: s_add_i32 s[[PTR_LO:[0-9]+]], s8, 8 +; GCN: s_add_i32 s[[PTR_LO:[0-9]+]], s4, 8 ; GCN: s_mov_b32 s[[PTR_HI:[0-9]+]], 0{{$}} ; GCN: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}} -define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) %in.byref) #0 { +define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) %in.byref) { %in = load i32, ptr addrspace(6) %in.byref store i32 %in, ptr addrspace(1) %out, align 4 ret void @@ -257,9 +257,9 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu ; } ; GCN-LABEL: 
{{^}}multi_byref_constant_i32_arg: -; GCN: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 +; GCN: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 ; GCN: .amdhsa_kernarg_size 20 -define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in0.byref, ptr addrspace(4) byref(i32) %in1.byref, i32 %after.offset) #0 { +define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in0.byref, ptr addrspace(4) byref(i32) %in1.byref, i32 %after.offset) { %in0 = load i32, ptr addrspace(4) %in0.byref %in1 = load i32, ptr addrspace(4) %in1.byref store volatile i32 %in0, ptr addrspace(1) %out, align 4 @@ -271,15 +271,13 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu ; GCN-LABEL: {{^}}byref_constant_i32_arg_offset0: ; GCN-NOT: s4 ; GCN-NOT: s5 -; GCN: s_load_dword {{s[0-9]+}}, s[8:9], 0x0{{$}} +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x0{{$}} ; GCN: .amdhsa_kernarg_size 4 -define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) %in.byref) #0 { +define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) %in.byref) { %in = load i32, ptr addrspace(4) %in.byref store i32 %in, ptr addrspace(1) undef, align 4 ret void } -attributes #0 = { "amdgpu-no-implicitarg-ptr" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll index 0a70734a65c206..3e0ad65c498213 100644 --- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll @@ -114,9 +114,9 @@ define amdgpu_ps void @only_kill() #0 { ; CHECK-NEXT: ; %bb.3: ; %DummyReturnBlock ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB2_4: -; CHECK-NEXT: s_mov_b64 exec, 0 -; CHECK-NEXT: exp null off, off, off, off done vm -; CHECK-NEXT: s_endpgm +; CHECK-NEXT: 
s_mov_b64 exec, 0 +; CHECK-NEXT: exp null off, off, off, off done vm +; CHECK-NEXT: s_endpgm main_body: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll index 7698372b687797..cb6073e9341e04 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -26,20 +26,20 @@ ; GCNHSA: .amdhsa_group_segment_fixed_size 0 ; GCNHSA: .amdhsa_private_segment_fixed_size 32772 ; GCNHSA: .amdhsa_user_sgpr_private_segment_buffer 1 -; GCNHSA: .amdhsa_user_sgpr_dispatch_ptr 1 -; GCNHSA: .amdhsa_user_sgpr_queue_ptr 1 +; GCNHSA: .amdhsa_user_sgpr_dispatch_ptr 0 +; GCNHSA: .amdhsa_user_sgpr_queue_ptr 0 ; GCNHSA: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GCNHSA: .amdhsa_user_sgpr_dispatch_id 1 +; GCNHSA: .amdhsa_user_sgpr_dispatch_id 0 ; GCNHSA: .amdhsa_user_sgpr_flat_scratch_init 1 ; GCNHSA: .amdhsa_user_sgpr_private_segment_size 0 ; GCNHSA: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GCNHSA: .amdhsa_system_sgpr_workgroup_id_x 1 -; GCNHSA: .amdhsa_system_sgpr_workgroup_id_y 1 -; GCNHSA: .amdhsa_system_sgpr_workgroup_id_z 1 +; GCNHSA: .amdhsa_system_sgpr_workgroup_id_y 0 +; GCNHSA: .amdhsa_system_sgpr_workgroup_id_z 0 ; GCNHSA: .amdhsa_system_sgpr_workgroup_info 0 -; GCNHSA: .amdhsa_system_vgpr_workitem_id 2 +; GCNHSA: .amdhsa_system_vgpr_workitem_id 0 ; GCNHSA: .amdhsa_next_free_vgpr 3 -; GCNHSA: .amdhsa_next_free_sgpr 18 +; GCNHSA: .amdhsa_next_free_sgpr 10 ; GCNHSA: .amdhsa_float_round_mode_32 0 ; GCNHSA: .amdhsa_float_round_mode_16_64 0 ; GCNHSA: .amdhsa_float_denorm_mode_32 3 diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 266ab687cd8d50..9619cb73b1538e 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -93,7 +93,7 @@ define void @use_extern_overalign() #0 { define amdgpu_kernel void 
@module_0_kernel_normal_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -113,27 +113,23 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] -; 
CHECK-NEXT: s_lshl_b32 s4, s15, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] +; CHECK-NEXT: s_lshl_b32 s4, s12, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_add_i32 s4, s4, 4 @@ -156,7 +152,7 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -176,27 +172,23 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 
s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] -; CHECK-NEXT: s_lshl_b32 s4, s15, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] +; CHECK-NEXT: s_lshl_b32 s4, s12, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_add_i32 s4, s4, 8 @@ -219,7 +211,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -239,27 +231,23 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 
-; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] -; CHECK-NEXT: s_lshl_b32 s4, s15, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] +; CHECK-NEXT: s_lshl_b32 s4, s12, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_add_i32 s4, s4, 8 @@ -282,7 +270,7 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -302,27 +290,23 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] -; CHECK-NEXT: s_lshl_b32 s4, s15, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] +; CHECK-NEXT: s_lshl_b32 s4, s12, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_add_i32 s4, s4, 8 @@ -352,29 +336,25 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v4, 2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_mov_b32 s15, 0 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ds_write_b16 v3, v4 +; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_normal @@ -385,37 +365,33 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: s_mov_b32 s15, 4 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 4 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: ds_write_b16 v0, v2 offset:2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable @@ -429,29 +405,25 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id define amdgpu_kernel void 
@module_0_kernel_overalign_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v4, 2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_mov_b32 s15, 2 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ds_write_b16 v3, v4 +; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_overalign @@ -462,37 +434,33 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 define amdgpu_kernel void 
@module_1_kernel_overalign_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: s_mov_b32 s15, 6 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 6 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: ds_write_b16 v0, v2 offset:4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable @@ -506,29 +474,25 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v4, 2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 
v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_mov_b32 s15, 1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ds_write_b16 v3, v4 +; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_normal @@ -539,37 +503,33 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: s_mov_b32 s15, 5 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 5 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: ds_write_b16 v0, v2 offset:2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable @@ -583,29 +543,25 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, 
use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v4, 2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_mov_b32 s15, 3 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ds_write_b16 v3, v4 +; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_overalign @@ -616,37 +572,33 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, 
s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: s_mov_b32 s15, 7 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 7 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: ds_write_b16 v0, v2 offset:4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable diff --git a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll index 9899d20cf3ae60..e1124f3ba89b5b 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck 
-check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: not llc -mtriple=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s @@ -11,21 +11,21 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p) { ; GCN-LABEL: name: load_zeroinit_lds_global ; GCN: bb.0 (%ir-block.0): - ; GCN: liveins: $sgpr2_sgpr3 - ; GCN: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 - ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 - ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 - ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 - ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX8: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3 + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GFX6: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 + ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: 
[[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec ; GCN: SI_INIT_M0 -1, implicit-def $m0 ; GCN: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 killed [[V_MOV_B32_e32_]], 40, 0, implicit $m0, implicit $exec - ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] - ; GFX8: BUFFER_STORE_DWORD_OFFSET killed [[DS_READ_B32_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec - ; GFX9: FLAT_STORE_DWORD killed [[COPY1]], killed [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr + ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GFX6: BUFFER_STORE_DWORD_OFFSET killed [[DS_READ_B32_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec + ; GFX8: FLAT_STORE_DWORD killed [[COPY1]], killed [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr ; GCN: S_ENDPGM 0 %gep = getelementptr [256 x i32], ptr addrspace(3) @lds, i32 0, i32 10 %ld = load i32, ptr addrspace(3) %gep diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index b61838c06a1f9d..952e89edeb7995 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -188,6 +188,9 @@ ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Lower OpenCL enqueued blocks ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions +; GCN-O1-NEXT: AMDGPU Attributor +; GCN-O1-NEXT: FunctionPass Manager +; GCN-O1-NEXT: Cycle Info Analysis ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Infer address spaces ; GCN-O1-NEXT: Dominator Tree Construction @@ -462,6 +465,9 @@ ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Lower OpenCL enqueued blocks ; GCN-O1-OPTS-NEXT: Lower 
uses of LDS variables from non-kernel functions +; GCN-O1-OPTS-NEXT: AMDGPU Attributor +; GCN-O1-OPTS-NEXT: FunctionPass Manager +; GCN-O1-OPTS-NEXT: Cycle Info Analysis ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Infer address spaces ; GCN-O1-OPTS-NEXT: Dominator Tree Construction @@ -766,6 +772,9 @@ ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Lower OpenCL enqueued blocks ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions +; GCN-O2-NEXT: AMDGPU Attributor +; GCN-O2-NEXT: FunctionPass Manager +; GCN-O2-NEXT: Cycle Info Analysis ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Infer address spaces ; GCN-O2-NEXT: Dominator Tree Construction @@ -1074,6 +1083,9 @@ ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Lower OpenCL enqueued blocks ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions +; GCN-O3-NEXT: AMDGPU Attributor +; GCN-O3-NEXT: FunctionPass Manager +; GCN-O3-NEXT: Cycle Info Analysis ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Infer address spaces ; GCN-O3-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll index 9445f1225e0cbe..9b63a8a3efcf92 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll @@ -9,7 +9,7 @@ define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -27,7 +27,7 @@ define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -44,7 +44,7 @@ define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: @@ -60,7 +60,7 @@ define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -79,7 +79,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -97,7 +97,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inre ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -114,7 +114,7 @@ define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 in ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -132,7 +132,7 @@ define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -149,7 +149,7 @@ define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: @@ -165,7 +165,7 @@ define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -184,7 +184,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %r ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 
v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -202,7 +202,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> i ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll index 61f0f20f057043..5a15dc53a292cd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll @@ -510,4 +510,4 @@ true: ret i32 42 false: ret i32 33 -} +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll index be270439ef57c4..ca7385be5dee7b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_i16_i32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll index 50561de5bdbd20..b59e584418bd8e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_u16_u32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll index ce6336da4fd962..0093e30b036444 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll index 66b4f143c60d07..d896090a476651 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} ; GCN: 
v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index e1caf3bea61197..920ff8a927e2d1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; SI-LABEL: s_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, v0 @@ -32,7 +32,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -42,7 +42,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3 @@ -51,7 +51,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX11-LABEL: 
s_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3 @@ -67,8 +67,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -78,10 +78,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, ; ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -89,33 +89,33 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: 
s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s4, s4 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s4, s4 +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -145,8 +145,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -165,8 +165,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v1, s7 @@ -188,13 +188,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -203,13 +203,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -218,10 +218,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -248,7 +246,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -264,7 +262,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -281,7 +279,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -292,7 +290,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -303,9 +301,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -328,7 +324,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -344,7 +340,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -361,7 +357,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -372,7 +368,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -383,9 +379,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX11-LABEL: 
v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -408,8 +402,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -428,8 +422,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -451,13 +445,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: 
global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -466,13 +460,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -481,10 +475,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -512,8 +504,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -532,8 +524,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -555,13 +547,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -570,13 +562,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: 
global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -585,10 +577,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -616,8 +606,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -636,8 +626,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -659,13 +649,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX9: ; %bb.0: -; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -674,13 +664,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -689,10 +679,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ 
-721,8 +709,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -741,8 +729,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -764,13 +752,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -779,13 +767,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; GFX10-LABEL: 
v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -|v1|, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -794,10 +782,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll index 50f1beba252272..f8a1388c9415e7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll @@ -4,54 +4,19 @@ declare i64 @llvm.amdgcn.dispatch.id() #1 ; GCN-LABEL: {{^}}dispatch_id: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s10 -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s11 -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] -; GCN: .amdhsa_user_sgpr_dispatch_id 1 -define amdgpu_kernel void @dispatch_id(ptr addrspace(1) %out) #0 { - %tmp0 = call i64 
@llvm.amdgcn.dispatch.id() - store i64 %tmp0, ptr addrspace(1) %out - ret void -} -; GCN-LABEL: {{^}}dispatch_id_opt0: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s8 -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s9 -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] -; GCN: .amdhsa_user_sgpr_dispatch_id 1 -define amdgpu_kernel void @dispatch_id_opt0(ptr addrspace(1) %out) #2 { - %tmp0 = call i64 @llvm.amdgcn.dispatch.id() - store i64 %tmp0, ptr addrspace(1) %out - ret void -} - -; GCN-LABEL: {{^}}dispatch_id_opt1: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] ; GCN: .amdhsa_user_sgpr_dispatch_id 1 -define amdgpu_kernel void @dispatch_id_opt1(ptr addrspace(1) %out) #3 { +define amdgpu_kernel void @dispatch_id(ptr addrspace(1) %out) #0 { %tmp0 = call i64 @llvm.amdgcn.dispatch.id() store i64 %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dispatch_id_opt2: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s4 -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s5 -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] -; GCN: .amdhsa_user_sgpr_dispatch_id 1 -define amdgpu_kernel void @dispatch_id_opt2() #4 { - %tmp0 = call i64 @llvm.amdgcn.dispatch.id() - store i64 %tmp0, ptr addrspace(1) null - ret void -} - attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -attributes #2 = { "amdgpu-no-dispatch-ptr" } -attributes #3 = { "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" } -attributes #4 = { "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-implicitarg-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll index dcbfef0acadca5..95e50da8a4709b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll @@ -17,5 +17,3 @@ declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { convergent inaccessiblememonly nounwind } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; MIR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll index 18c711d0b2aecc..3b64a8707b55e8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll @@ -59,7 +59,6 @@ define amdgpu_kernel void @one_f32() #0 { define amdgpu_kernel void @id_i32() #0 { ; GFX11-LABEL: id_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 m0, 0 ; GFX11-NEXT: exp pos0 v0, off, off, off done row_en ; GFX11-NEXT: s_endpgm @@ -71,8 +70,7 @@ define amdgpu_kernel void @id_i32() #0 { define amdgpu_kernel void @id_arg_i32(i32 %row) #0 { ; GFX11-LABEL: id_arg_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 m0, s0 ; GFX11-NEXT: exp pos0 v0, off, off, off done row_en @@ -86,19 +84,16 @@ define amdgpu_kernel void @id_arg_i32(i32 %row) #0 { define amdgpu_kernel void @id_row_i32() #0 { ; GFX11-SDAG-LABEL: id_row_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: s_mov_b32 m0, s0 ; GFX11-SDAG-NEXT: exp pos0 v0, off, off, off done row_en ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: id_row_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; 
GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x63 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 m0, v0 ; GFX11-GISEL-NEXT: exp pos0 v1, off, off, off done row_en ; GFX11-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll index a26b84e17374af..17b941c59fd3f3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll @@ -15,7 +15,7 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -27,7 +27,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -37,7 +37,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -50,7 +50,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -78,7 +78,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| @@ -88,7 +88,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| @@ -101,7 +101,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; 
GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| @@ -126,7 +126,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -136,7 +136,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f32: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -150,10 +150,10 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -164,23 +164,23 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -191,13 +191,13 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1) store i32 %result, ptr addrspace(1) %out @@ -208,10 +208,10 @@ define amdgpu_kernel 
void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_one: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -222,23 +222,23 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_one: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_one: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; 
GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -249,13 +249,13 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_one: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6) store i32 %result, ptr addrspace(1) %out @@ -266,10 +266,10 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ogt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -280,23 +280,23 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ogt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; 
SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ogt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -307,13 +307,13 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ogt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 
s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2) store i32 %result, ptr addrspace(1) %out @@ -324,10 +324,10 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -338,23 +338,23 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_oge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -365,13 +365,13 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_oge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3) store i32 %result, ptr addrspace(1) %out @@ -382,10 +382,10 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_olt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: 
global_store_b32 v0, v1, s[0:1] @@ -396,23 +396,23 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_olt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_olt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -423,13 +423,13 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_olt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: 
v_cmp_gt_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4) store i32 %result, ptr addrspace(1) %out @@ -440,10 +440,10 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ole: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -454,23 +454,23 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ole: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ole: ; 
GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -481,13 +481,13 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ole: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5) store i32 %result, ptr addrspace(1) %out @@ -498,10 +498,10 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_o: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: 
v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -512,23 +512,23 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_o: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_o: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -539,13 +539,13 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_o: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 
+; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 7) store i32 %result, ptr addrspace(1) %out @@ -556,10 +556,10 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_uo: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -570,23 +570,23 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_uo: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: 
v_cmp_u_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_uo: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -597,13 +597,13 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_uo: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 8) store i32 %result, ptr addrspace(1) %out @@ -614,10 +614,10 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ueq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -628,23 +628,23 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ueq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ueq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -655,13 +655,13 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, 
float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ueq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9) store i32 %result, ptr addrspace(1) %out @@ -672,10 +672,10 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_une: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -686,23 +686,23 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_une: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: 
v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_une: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -713,13 +713,13 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_une: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14) store i32 %result, ptr addrspace(1) %out @@ -730,10 +730,10 
@@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -744,23 +744,23 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 
0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -771,13 +771,13 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10) store i32 %result, ptr addrspace(1) %out @@ -788,10 +788,10 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -802,23 +802,23 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: 
s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -829,13 +829,13 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4 +; 
GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11) store i32 %result, ptr addrspace(1) %out @@ -846,10 +846,10 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -860,23 +860,23 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 
s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -887,13 +887,13 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12) store i32 %result, ptr addrspace(1) %out @@ -904,10 +904,10 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: 
v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -918,23 +918,23 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -945,13 +945,13 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; 
GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13) store i32 %result, ptr addrspace(1) %out @@ -961,7 +961,7 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -973,7 +973,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -983,7 +983,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -996,7 +996,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_one: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1023,7 +1023,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_one: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1033,7 +1033,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_one: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_one: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1061,7 
+1061,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1083,7 +1083,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1111,7 +1111,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1146,7 +1146,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_olt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1173,7 +1173,7 
@@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_olt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_olt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1196,7 +1196,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_olt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1211,7 +1211,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ole: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ole: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1233,7 +1233,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ole: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1246,7 +1246,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ole: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1261,7 +1261,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1273,7 +1273,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1283,7 +1283,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; 
GISEL-GFX11-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1296,7 +1296,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1311,7 +1311,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_o: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1323,7 +1323,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_o: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1333,7 +1333,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_o: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: 
v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_o: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1361,7 +1361,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uo: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1373,7 +1373,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_uo: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1383,7 +1383,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uo: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1396,7 +1396,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uo: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1411,7 +1411,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_une: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1423,7 +1423,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_une: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1433,7 +1433,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_une: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1446,7 +1446,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_une: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1461,7 +1461,7 @@ define amdgpu_kernel void 
@v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1473,7 +1473,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1483,7 +1483,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1496,7 +1496,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1523,7 +1523,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1533,7 +1533,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1546,7 +1546,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ult: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1573,7 +1573,7 @@ define amdgpu_kernel void 
@v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ult: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1596,7 +1596,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1611,7 +1611,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ule: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1623,7 +1623,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: 
v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ule: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1646,7 +1646,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1663,12 +1663,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; SDAG-GFX11-NEXT: s_nop 0 @@ -1678,26 +1678,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: ; 
SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_nop 0 @@ -1707,14 +1707,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: 
s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %temp = call half @llvm.fabs.f16(half %a) %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half %temp, i32 1) @@ -1727,12 +1727,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; SDAG-GFX11-NEXT: s_nop 0 @@ -1742,26 +1742,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; 
SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_nop 0 @@ -1771,14 +1771,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0| +; 
GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %temp = call half @llvm.fabs.f16(half %a) %src_input = call half @llvm.fabs.f16(half %src) @@ -1798,7 +1798,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1808,7 +1808,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f16: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -1823,10 +1823,10 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1837,23 +1837,23 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1864,13 +1864,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call 
i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 1) store i32 %result, ptr addrspace(1) %out @@ -1882,10 +1882,10 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_one: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1896,23 +1896,23 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_one: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_one: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1923,13 +1923,13 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_one: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 6) store i32 %result, ptr addrspace(1) %out @@ -1941,10 +1941,10 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ogt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1955,23 +1955,23 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; 
SDAG-GFX10-LABEL: v_fcmp_f16_ogt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ogt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1982,13 +1982,13 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ogt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: 
v_cmp_lt_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 2) store i32 %result, ptr addrspace(1) %out @@ -2000,10 +2000,10 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_oge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2014,23 +2014,23 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_oge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: 
s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2041,13 +2041,13 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_oge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 3) store i32 %result, ptr addrspace(1) %out @@ -2059,10 +2059,10 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_olt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 
s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2073,23 +2073,23 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_olt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_olt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2100,13 +2100,13 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_olt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; 
GISEL-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 4) store i32 %result, ptr addrspace(1) %out @@ -2118,10 +2118,10 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ole: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2132,23 +2132,23 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ole: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ole: ; 
GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2159,13 +2159,13 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ole: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 5) store i32 %result, ptr addrspace(1) %out @@ -2177,10 +2177,10 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ueq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 
s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2191,23 +2191,23 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ueq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ueq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2218,13 +2218,13 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ueq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 9) store i32 %result, ptr addrspace(1) %out @@ -2236,10 +2236,10 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_une: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2250,23 +2250,23 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_une: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: 
v_cmp_neq_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_une: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2277,13 +2277,13 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_une: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 14) store i32 %result, ptr addrspace(1) %out @@ -2295,10 +2295,10 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 
s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2309,23 +2309,23 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2336,13 +2336,13 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: 
v_fcmp_f16_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 10) store i32 %result, ptr addrspace(1) %out @@ -2354,10 +2354,10 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2368,23 +2368,23 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt 
lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2395,13 +2395,13 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 11) store i32 %result, ptr addrspace(1) %out @@ -2413,10 +2413,10 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, 
half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2427,23 +2427,23 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 
v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2454,13 +2454,13 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 12) store i32 %result, ptr addrspace(1) %out @@ -2471,10 +2471,10 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_o: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2485,23 +2485,23 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_o: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_o: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2512,13 +2512,13 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_o: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, 
half 100.00, i32 7) store i32 %result, ptr addrspace(1) %out @@ -2529,10 +2529,10 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_uo: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2543,23 +2543,23 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_uo: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_uo: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; 
GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2570,13 +2570,13 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_uo: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 8) store i32 %result, ptr addrspace(1) %out @@ -2587,10 +2587,10 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2601,23 +2601,23 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ule: ; SDAG-GFX10: ; %bb.0: ; 
SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2628,13 +2628,13 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: 
v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 13) store i32 %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll index 7e78d8b05d09f6..ce055d65279966 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -16,7 +16,7 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| @@ -30,7 +30,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -42,7 +42,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: 
v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| @@ -74,7 +74,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |s3| @@ -88,7 +88,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -100,7 +100,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| @@ -113,7 +113,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| @@ -137,7 +137,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GFX11-GISEL-LABEL: v_fcmp_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GFX9-GISEL-LABEL: v_fcmp_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -163,7 +163,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -178,11 +178,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_oeq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -193,24 +193,24 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; 
GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -220,11 +220,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -240,11 +240,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_one: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -255,24 +255,24 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -282,11 +282,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -302,11 +302,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ogt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -317,24 +317,24 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 
0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -344,11 +344,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -364,11 +364,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_oge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -379,24 +379,24 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -406,11 +406,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -426,11 +426,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_olt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -441,24 +441,24 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -468,11 +468,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: 
v_fcmp_f32_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -488,11 +488,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ole: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -503,24 +503,24 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 
v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -530,11 +530,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -550,11 +550,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_o: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -565,24 +565,24 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr 
addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_o_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -592,11 +592,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -612,11 +612,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr 
addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_uo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -627,24 +627,24 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_u_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -654,11 +654,11 @@ define 
amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -674,11 +674,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ueq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -689,24 +689,24 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -716,11 +716,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -736,11 +736,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_une: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 
; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -751,24 +751,24 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -778,11 +778,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: 
v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -798,11 +798,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -813,24 +813,24 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nle_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -840,11 +840,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -860,11 +860,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -875,24 +875,24 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -902,11 +902,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -922,11 +922,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: 
v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -937,24 +937,24 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nge_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -964,11 +964,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -984,11 +984,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -999,24 +999,24 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ngt_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; 
VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -1026,11 +1026,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1045,7 +1045,7 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oeq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1059,7 +1059,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1086,7 +1086,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1105,7 +1105,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_one: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1119,7 +1119,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1146,7 +1146,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1165,7 +1165,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ogt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1179,7 +1179,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1192,7 +1192,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1206,7 +1206,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1225,7 +1225,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1239,7 +1239,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1252,7 +1252,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_olt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1299,7 +1299,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1326,7 +1326,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1345,7 +1345,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ole: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1359,7 +1359,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; 
VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1386,7 +1386,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1405,7 +1405,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ueq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1419,7 +1419,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1446,7 +1446,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 
v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1465,7 +1465,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_o: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1492,7 +1492,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1506,7 +1506,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1525,7 +1525,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: v_cmp_u_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1539,7 +1539,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1566,7 +1566,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1585,7 +1585,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_une: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1599,7 +1599,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 
v2, 0 @@ -1612,7 +1612,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1626,7 +1626,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,7 +1645,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ugt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nge_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1659,7 +1659,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1672,7 +1672,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,7 +1686,7 @@ define 
amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1705,7 +1705,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ngt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1719,7 +1719,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1746,7 +1746,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1765,7 +1765,7 @@ define amdgpu_kernel void 
@v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ult: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nle_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1779,7 +1779,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1792,7 +1792,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1806,7 +1806,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1825,7 +1825,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ule: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ 
-1839,7 +1839,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1852,7 +1852,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1866,7 +1866,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1887,13 +1887,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |s2| +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |s3| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1904,26 +1904,26 @@ define amdgpu_kernel void 
@v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; ; GFX9-LABEL: v_fcmp_f16_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_lshr_b32 s2, s4, 16 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| +; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -1933,12 +1933,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_lshr_b32 s2, s4, 16 
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| +; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1956,13 +1956,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |s2| +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |s3| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1973,26 +1973,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, 
s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_lshr_b32 s2, s4, 16 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| +; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2002,12 +2002,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_lshr_b32 s2, s4, 16 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| +; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2028,7 +2028,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GFX11-GISEL-LABEL: v_fcmp_f16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], 
s[0:1] @@ -2042,7 +2042,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GFX9-GISEL-LABEL: v_fcmp_f16: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -2054,7 +2054,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -2070,11 +2070,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_oeq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2085,24 +2085,24 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2112,11 +2112,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2133,11 +2133,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_one: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 
0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2148,24 +2148,24 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2175,11 +2175,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f16_e64 
s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2196,11 +2196,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ogt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2211,24 +2211,24 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2238,11 +2238,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2259,11 +2259,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_oge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2274,24 +2274,24 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2301,11 +2301,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2322,11 +2322,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_olt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2337,24 +2337,24 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2364,11 +2364,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 
v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2385,11 +2385,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ole: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2400,24 +2400,24 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2427,11 +2427,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2448,11 +2448,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ueq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2463,24 +2463,24 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nlg_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2490,11 +2490,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2511,11 +2511,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_une: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2526,24 +2526,24 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2553,11 +2553,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, 
s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2574,11 +2574,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2589,24 +2589,24 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nle_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2616,11 +2616,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2637,11 +2637,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2652,24 +2652,24 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nlt_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2679,11 +2679,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2700,11 +2700,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2715,24 +2715,24 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nge_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2742,11 +2742,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, 
s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2762,11 +2762,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_o: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2777,24 +2777,24 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_o_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_o: ; VI-SDAG: ; %bb.0: -; 
VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2804,11 +2804,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2824,11 +2824,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_uo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2839,24 +2839,24 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, 
s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_u_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2866,11 +2866,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2886,11 +2886,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2901,24 +2901,24 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ngt_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2928,11 +2928,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ule: ; 
VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll index 78d5da8dda177b..ca06a57be19ccd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -8,7 +8,7 @@ declare bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> %a, <2 x bfloat> %b, bf define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] @@ -34,17 +34,18 @@ entry: } define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp( -; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: scratch_load_b32 v0, off, s2 -; GFX11-NEXT: scratch_load_u16 v1, off, s3 -; GFX11-NEXT: scratch_load_b32 v2, off, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11-NEXT: scratch_store_b16 off, v0, s0 -; GFX11-NEXT: s_endpgm +; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp: +; SDAG-GFX11: ; %bb.0: ; %entry +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2 +; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3 +; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s1 +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s0 +; SDAG-GFX11-NEXT: s_endpgm +; ; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp: ; GISEL-GFX11: ; %bb.0: ; %entry ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 @@ -94,5 +95,3 @@ entry: } declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; SDAG-GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll index 1343f25ec275e5..99c3deaada8c6b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c) define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] @@ -35,7 +35,7 @@ entry: define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp( ; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp: ; SDAG-GFX11: ; %bb.0: ; %entry -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2 ; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3 @@ -47,7 +47,7 @@ define amdgpu_kernel void 
@test_llvm_amdgcn_fdot2_f16_f16_dpp( ; ; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp: ; GISEL-GFX11: ; %bb.0: ; %entry -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: scratch_load_b32 v0, off, s1 ; GISEL-GFX11-NEXT: scratch_load_b32 v1, off, s2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index 8a8b0490e9480b..e51b1d2da2e414 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -7,7 +7,7 @@ declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, floa define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -37,7 +37,7 @@ entry: define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll index e74485142fb6f0..d318bc80e49760 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll @@ -107,4 +107,4 @@ declare float @llvm.amdgcn.fmul.legacy(float, float) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "denormal-fp-math"="preserve-sign" 
"amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #2 = { nounwind "denormal-fp-math"="preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll index f631a0bfc28eb0..434fa1bf7b340b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll @@ -7,7 +7,7 @@ declare i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1), i64) define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) %addr, i64 %in) { ; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_no_rtn: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -16,7 +16,7 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) ; ; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_no_rtn: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -31,12 +31,11 @@ entry: define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %addr, i64 %in, ptr addrspace(1) %use) { ; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_rtn: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; 
GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 ; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -47,8 +46,8 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a ; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_rtn: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll index 291c249e4b7384..f6197e0770213c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -8,7 +8,7 @@ declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1)) define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 @@ -27,7 +27,7 @@ entry: define 
amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll index 12742f4f7127b8..a2dc3662fcc485 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -8,7 +8,7 @@ declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1)) define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 @@ -27,7 +27,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll index 9e3e393d82e223..309fd99031155d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll @@ -22,10 +22,10 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_eq: ; SDAG-GFX11: ; %bb.0: ; 
SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -36,23 +36,23 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_eq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_eq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -63,13 +63,13 @@ define amdgpu_kernel 
void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_eq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -87,7 +87,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -97,7 +97,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i32: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -111,10 +111,10 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ne: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 
0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -125,23 +125,23 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ne: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ne: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -152,13 +152,13 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ne: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: 
s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33) store i32 %result, ptr addrspace(1) %out @@ -169,10 +169,10 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -183,23 +183,23 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: 
global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -210,13 +210,13 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34) store i32 %result, ptr addrspace(1) %out @@ -227,10 +227,10 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; 
SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -241,23 +241,23 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -268,13 +268,13 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; 
GISEL-GFX10-LABEL: v_icmp_i32_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35) store i32 %result, ptr addrspace(1) %out @@ -285,10 +285,10 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -299,23 +299,23 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt 
lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -326,13 +326,13 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36) store i32 %result, ptr addrspace(1) %out @@ -343,10 +343,10 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: 
v_icmp_i32_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -357,23 +357,23 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, 
s[0:1] @@ -384,13 +384,13 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37) store i32 %result, ptr addrspace(1) %out @@ -401,10 +401,10 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-GFX11-LABEL: v_icmp_i32_sgt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -415,23 +415,23 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-GFX10-LABEL: v_icmp_i32_sgt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sgt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -442,13 +442,13 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GISEL-GFX10-LABEL: v_icmp_i32_sgt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38) store i32 %result, ptr addrspace(1) %out @@ -459,10 
+459,10 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_sge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -473,23 +473,23 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_sge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -500,13 +500,13 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_sge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39) store i32 %result, ptr addrspace(1) %out @@ -517,10 +517,10 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_slt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -531,23 +531,23 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_slt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_slt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -558,13 +558,13 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_slt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call 
i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40) store i32 %result, ptr addrspace(1) %out @@ -575,10 +575,10 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_sle: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -589,23 +589,23 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_sle: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sle: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: 
v_cmp_ge_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -616,13 +616,13 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_sle: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41) store i32 %result, ptr addrspace(1) %out @@ -632,7 +632,7 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_eq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -644,7 +644,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_eq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: 
v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -654,7 +654,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_eq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -667,7 +667,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_eq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -682,7 +682,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_ne: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -694,7 +694,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_ne: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -704,7 +704,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_ne: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; 
GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -717,7 +717,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_ne: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -732,7 +732,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ugt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -744,7 +744,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -754,7 +754,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ugt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -767,7 +767,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ugt: ; GISEL-GFX10: ; %bb.0: -; 
GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -782,7 +782,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_uge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -794,7 +794,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -804,7 +804,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_uge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -817,7 +817,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -832,7 +832,7 @@ define amdgpu_kernel void 
@v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ult: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -844,7 +844,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -854,7 +854,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ult: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -867,7 +867,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -882,7 +882,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ule: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; 
SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -894,7 +894,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -904,7 +904,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ule: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -917,7 +917,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -932,7 +932,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sgt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -944,7 +944,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sgt: ; SDAG-GFX10: ; %bb.0: -; 
SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -954,7 +954,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sgt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -967,7 +967,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sgt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -982,7 +982,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -994,7 +994,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1004,7 +1004,7 @@ define amdgpu_kernel void 
@v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1017,7 +1017,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_slt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1044,7 +1044,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_slt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1054,7 +1054,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_slt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1067,7 +1067,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_slt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sle: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sle: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1104,7 +1104,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sle: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1117,7 +1117,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sle: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1133,10 +1133,10 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_eq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1147,23 +1147,23 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_eq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_eq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; 
GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1174,13 +1174,13 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_eq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -1198,7 +1198,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i16: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; 
GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -1222,10 +1222,10 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ne: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1236,23 +1236,23 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ne: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ne: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: 
v_cmp_ne_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1263,13 +1263,13 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ne: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 33) store i32 %result, ptr addrspace(1) %out @@ -1280,10 +1280,10 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1294,23 +1294,23 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; 
SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1321,13 +1321,13 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: 
global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 34) store i32 %result, ptr addrspace(1) %out @@ -1338,10 +1338,10 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1352,23 +1352,23 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: 
v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1379,13 +1379,13 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 35) store i32 %result, ptr addrspace(1) %out @@ -1396,10 +1396,10 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1410,23 +1410,23 @@ define amdgpu_kernel void 
@v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1437,13 +1437,13 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: 
global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 36) store i32 %result, ptr addrspace(1) %out @@ -1454,10 +1454,10 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1468,23 +1468,23 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 
s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1495,13 +1495,13 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 37) store i32 %result, ptr addrspace(1) %out @@ -1512,10 +1512,10 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; SDAG-GFX11-LABEL: v_icmp_i16_sgt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 
v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1526,23 +1526,23 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; SDAG-GFX10-LABEL: v_icmp_i16_sgt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sgt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1553,13 +1553,13 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GISEL-GFX10-LABEL: v_icmp_i16_sgt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt 
lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 38) store i32 %result, ptr addrspace(1) %out @@ -1570,10 +1570,10 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_sge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1584,23 +1584,23 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_sge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sge: ; GISEL-GFX11: 
; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1611,13 +1611,13 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_sge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 39) store i32 %result, ptr addrspace(1) %out @@ -1628,10 +1628,10 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_slt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; 
SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1642,23 +1642,23 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_slt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_slt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1669,13 +1669,13 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_slt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; 
GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 40) store i32 %result, ptr addrspace(1) %out @@ -1686,10 +1686,10 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_sle: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1700,23 +1700,23 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_sle: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; 
SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sle: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1727,13 +1727,13 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_sle: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 41) store i32 %result, ptr addrspace(1) %out @@ -1743,7 +1743,7 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: s_cmp_gt_u32 s2, 1 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 @@ -1759,7 +1759,7 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX10-LABEL: v_icmp_i1_ne0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_gt_u32 s2, 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index 60e242bf5b0e8f..5f979e0177f588 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -25,11 +25,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_eq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -40,11 +40,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 
@@ -54,24 +54,24 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -98,7 +98,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -108,7 +108,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; 
GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -117,7 +117,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX9-LABEL: v_icmp_i32: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -131,11 +131,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ne: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -146,11 +146,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -160,24 +160,24 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword 
s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -193,11 +193,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -208,11 +208,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ugt: ; SDAG-VI: 
; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -222,24 +222,24 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -255,11 +255,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_uge: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -270,11 +270,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -284,24 +284,24 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: 
v_icmp_i32_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -317,11 +317,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -332,11 +332,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -346,24 +346,24 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ult: ; GFX9: ; %bb.0: -; 
GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -379,11 +379,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -394,11 +394,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: 
v_icmp_i32_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -408,24 +408,24 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -441,11 +441,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GFX11-LABEL: 
v_icmp_i32_sgt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -456,11 +456,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; ; SDAG-VI-LABEL: v_icmp_i32_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -470,24 +470,24 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; ; GFX9-LABEL: v_icmp_i32_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: 
s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -503,11 +503,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_sge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -518,11 +518,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -532,24 +532,24 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: 
v_icmp_i32_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_i32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -565,11 +565,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_slt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -580,11 +580,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) 
%out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -594,24 +594,24 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -627,11 +627,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, 
i32 %src) { ; GFX11-LABEL: v_icmp_i32_sle: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -642,11 +642,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -656,24 +656,24 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_i32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, 
v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -688,7 +688,7 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_eq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e64 s[2:3], 0x64, s[2:3] @@ -702,7 +702,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,7 +716,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -729,7 +729,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -748,7 +748,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_ne: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u64_e64 s[2:3], 0x64, s[2:3] @@ -762,7 +762,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -776,7 +776,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -789,7 +789,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -808,7 +808,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ugt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_u64_e64 s[2:3], 0x64, s[2:3] @@ -822,7 +822,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -836,7 +836,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -849,7 +849,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_uge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e64 s[2:3], 0x64, s[2:3] @@ -882,7 +882,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -896,7 
+896,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -909,7 +909,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -928,7 +928,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ult: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_u64_e64 s[2:3], 0x64, s[2:3] @@ -942,7 +942,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -956,7 +956,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -969,7 +969,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: 
v_icmp_u64_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -988,7 +988,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ule: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_u64_e64 s[2:3], 0x64, s[2:3] @@ -1002,7 +1002,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1029,7 +1029,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: 
v_icmp_i64_sgt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_i64_e64 s[2:3], 0x64, s[2:3] @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1076,7 +1076,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1089,7 +1089,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1108,7 +1108,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e64 s[2:3], 0x64, s[2:3] @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1136,7 +1136,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1168,7 +1168,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_slt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_i64_e64 s[2:3], 0x64, s[2:3] @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,7 +1196,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; 
GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1209,7 +1209,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1228,7 +1228,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sle: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_i64_e64 s[2:3], 0x64, s[2:3] @@ -1242,7 +1242,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1256,7 +1256,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1269,7 +1269,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1289,11 
+1289,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_eq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1304,11 +1304,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1318,24 +1318,24 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1362,7 +1362,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1381,7 +1381,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX9-LABEL: v_icmp_i16: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -1395,11 +1395,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ne: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 
0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1410,11 +1410,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1424,24 +1424,24 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ne_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ne: ; GISEL-VI: ; 
%bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1457,11 +1457,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1472,11 +1472,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1486,24 +1486,24 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword 
s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1519,11 +1519,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1534,11 +1534,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_uge: ; 
SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1548,24 +1548,24 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1581,11 +1581,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ult: ; GFX11: 
; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1596,11 +1596,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1610,24 +1610,24 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; 
GISEL-VI-LABEL: v_icmp_i16_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1643,11 +1643,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1658,11 +1658,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1672,24 +1672,24 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: 
v_icmp_i16_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1705,11 +1705,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GFX11-LABEL: v_icmp_i16_sgt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1720,11 +1720,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr 
addrspace(1) %out, i16 %src) #1 { ; ; SDAG-VI-LABEL: v_icmp_i16_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1734,24 +1734,24 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; ; GFX9-LABEL: v_icmp_i16_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_i16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1767,11 +1767,11 @@ define amdgpu_kernel void 
@v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_sge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1782,11 +1782,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1796,24 +1796,24 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_i16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1829,11 +1829,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_slt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1844,11 +1844,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1858,24 +1858,24 @@ define amdgpu_kernel void 
@v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_i16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1891,11 +1891,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_sle: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1906,11 
+1906,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1920,24 +1920,24 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_i16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1952,7 
+1952,7 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_gt_u32 s2, 1 @@ -1970,7 +1970,7 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; VI-LABEL: v_icmp_i1_ne0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_gt_u32 s2, 1 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -1986,7 +1986,7 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX9-LABEL: v_icmp_i1_ne0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_gt_u32 s2, 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 3168e05b816bee..dba67a03c000e5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -14,9 +14,8 @@ entry: define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -152,11 +151,11 @@ entry: define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) 
noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 @@ -178,7 +177,6 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 ; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 -; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] ; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll index f7f72ae31cc1db..70eff494501532 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -310,10 +310,10 @@ define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32> declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #2 declare ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2 -attributes #0 = { nounwind noinline "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" } -attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" } +attributes #0 = { nounwind noinline } +attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" } attributes #2 = { nounwind readnone speculatable } -attributes #3 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" } +attributes #3 = { nounwind noinline 
"amdgpu-implicitarg-num-bytes"="0" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 2d01703c78d78d..f1a4fe0f090b16 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -163,7 +163,7 @@ main_body: define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 @@ -189,7 +189,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 @@ -215,13 +215,11 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -230,8 +228,8 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -262,7 +260,7 @@ main_body: define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 @@ -285,7 +283,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 @@ -308,22 +306,21 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; 
GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] ; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 -; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -352,9 +349,9 @@ main_body: define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -378,9 +375,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX1030-LABEL: 
image_bvh64_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000 @@ -404,16 +401,15 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7 ; GFX11-NEXT: v_bfrev_b32_e32 v10, 4.0 +; GFX11-NEXT: v_mov_b32_e32 v7, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -447,9 +443,9 @@ main_body: define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 
+; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -470,9 +466,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -493,23 +489,21 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 -; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v5, 2.0 +; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 ; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0 -; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX11-NEXT: flat_load_b32 v8, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 ; GFX11-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 0076079ce17c77..bc10eb68d75cbb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}is_private_vgpr: ; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] -; CI-DAG: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-DAG: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CIT: v_cmp_eq_u32_e32 vcc, s4, v[[PTR_HI]] ; CIH: v_cmp_eq_u32_e32 vcc, s2, v[[PTR_HI]] @@ -26,10 +26,10 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; select and vcc branch. ; GCN-LABEL: {{^}}is_private_sgpr: -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x1{{$}} +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x1{{$}} -; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x32{{$}} -; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} +; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x32{{$}} +; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} ; CI: s_cmp_eq_u32 [[APERTURE]], [[PTR_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index e24c47991fe3d7..aad4d924952fff 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}is_local_vgpr: ; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] -; CI-DAG: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-DAG: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base ; GFX9: v_cmp_eq_u32_e32 vcc, s[[HI]], v[[PTR_HI]] @@ -26,10 +26,10 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; select 
and vcc branch. ; GCN-LABEL: {{^}}is_local_sgpr: -; CI-DAG: s_load_dword s0, s[6:7], 0x1 +; CI-DAG: s_load_dword s0, s[4:5], 0x1 -; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x33{{$}} -; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} +; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x33{{$}} +; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} ; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base ; GFX9: s_cmp_eq_u32 [[PTR_HI]], s[[HI]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll index ee005eb6e98410..8dba22312ac88c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ -4,9 +4,9 @@ ; ALL-LABEL: {{^}}test: ; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1 -; CO-V4: s_load_dword s{{[0-9]+}}, s[8:9], 0xa +; CO-V4: s_load_dword s{{[0-9]+}}, s[4:5], 0xa -; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[4:5], 0xa +; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa ; HSA: .amdhsa_kernarg_size 8 ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 1 @@ -23,7 +23,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out) #1 { ; OS-MESA3D: kernarg_segment_alignment = 4 ; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15 -; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[4:5], 0x15 +; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15 ; HSA: .amdhsa_kernarg_size 8 define amdgpu_kernel void @test_implicit(ptr addrspace(1) %out) #1 { %implicitarg.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -78,7 +78,7 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(ptr addrspace(1) %out, ; HSA: .amdhsa_kernarg_size 0 ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -define amdgpu_kernel void @test_no_kernargs() #4 { +define amdgpu_kernel void @test_no_kernargs() #1 { %kernarg.segment.ptr = call noalias ptr addrspace(4) 
@llvm.amdgcn.kernarg.segment.ptr() %gep = getelementptr i32, ptr addrspace(4) %kernarg.segment.ptr, i64 10 %value = load i32, ptr addrspace(4) %gep @@ -123,7 +123,6 @@ attributes #0 = { nounwind readnone } attributes #1 = { nounwind "amdgpu-implicitarg-num-bytes"="0" } attributes #2 = { nounwind "amdgpu-implicitarg-num-bytes"="48" } attributes #3 = { nounwind "amdgpu-implicitarg-num-bytes"="38" } -attributes #4 = { nounwind "amdgpu-implicitarg-num-bytes"="0" "amdgpu-no-implicitarg-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index c201f84cac7268..61818dafd2b84c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -23,8 +23,8 @@ define void @function_lds_id(ptr addrspace(1) %out) { define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: kernel_lds_id: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s2, s10, 42 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_add_i32 s2, s6, 42 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -42,27 +42,21 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l ; GCN-LABEL: indirect_lds_id: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s8, s6, 8 -; 
GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_addc_u32 s9, s7, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, function_lds_id@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 21 +; GCN-NEXT: s_mov_b32 s12, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm call void @function_lds_id(ptr addrspace(1) %out) @@ -72,7 +66,7 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: doesnt_use_it: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v2, 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 8e9a652ae8a8ef..1ae22c3eec185b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -15,20 +15,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vss_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: 
v_permlane16_b32 v0, v0, s7, s0 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -41,8 +41,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 % ; GFX12-LABEL: v_permlane16_b32_vss_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -60,20 +60,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float ; GFX10-LABEL: v_permlane16_b32_vss_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -86,8 +86,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float ; GFX12-LABEL: v_permlane16_b32_vss_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -105,36 +105,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -149,8 +149,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -165,8 +165,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -181,8 +181,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 
% ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -202,36 +202,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 
s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -246,8 +246,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -262,8 +262,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -278,8 +278,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; 
GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -299,22 +299,22 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vii_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vii_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -324,7 +324,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-LABEL: v_permlane16_b32_vii_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -342,22 +342,22 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) 
%out, float ; GFX10-LABEL: v_permlane16_b32_vii_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vii_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -367,7 +367,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float ; ; GFX12-LABEL: v_permlane16_b32_vii_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -384,7 +384,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 
s3 @@ -396,7 +396,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -408,7 +408,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -422,7 +422,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -436,7 +436,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -450,7 +450,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -469,7 +469,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -481,7 +481,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -493,7 +493,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -507,7 +507,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -521,7 +521,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; 
GFX12-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -535,7 +535,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -556,25 +556,25 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vll_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vll_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 
v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -583,7 +583,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-LABEL: v_permlane16_b32_vll_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -601,7 +601,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -614,7 +614,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -627,7 +627,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -643,7 +643,7 @@ define 
amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -659,7 +659,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -675,7 +675,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -697,25 +697,25 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; GFX10-LABEL: v_permlane16_b32_vll_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, 
s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vll_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -724,7 +724,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; ; GFX12-LABEL: v_permlane16_b32_vll_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -742,7 +742,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vll_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -755,7 +755,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vll_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -768,7 +768,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vll_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -784,7 +784,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vll_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -800,7 +800,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vll_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -816,7 +816,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vll_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -838,33 +838,33 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vvv_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -873,17 +873,17 @@ define 
amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -891,7 +891,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -909,7 +909,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -933,7 +933,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -947,7 +947,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -961,7 +961,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-LABEL: v_permlane16_b32_vvv_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -981,7 +981,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-LABEL: v_permlane16_b32_vvv_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -1009,33 +1009,33 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX10-LABEL: v_permlane16_b32_vvv_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c 
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1044,17 +1044,17 
@@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -1080,7 +1080,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 
v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -1104,7 +1104,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -1118,7 +1118,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; ; GFX11-LABEL: v_permlane16_b32_vvv_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -1152,7 +1152,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; ; GFX12-LABEL: v_permlane16_b32_vvv_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -1179,7 +1179,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; 
GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1201,12 +1201,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -1215,12 +1215,11 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: 
v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -1229,12 +1228,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -1243,12 +1242,11 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -1264,70 
+1262,102 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 % ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s2 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s2 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vvs_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: 
s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vvs_i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -1337,7 +1367,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 % define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -1348,7 +1378,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1359,12 +1389,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -1373,12 +1403,11 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -1387,12 +1416,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; 
GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -1401,12 +1430,11 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -1422,70 +1450,102 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s2 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: 
v_permlane16_b32_vvs_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s2 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vvs_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vvs_f64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: 
s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -1495,7 +1555,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX10-SDAG-LABEL: 
v_permlane16_b32_vsv_i32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 @@ -1506,7 +1566,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1517,7 +1577,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1532,7 +1592,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1547,7 +1607,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1562,7 +1622,7 @@ define 
amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1584,38 +1644,40 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s0 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1633,8 +1695,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1652,8 +1714,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_1) @@ -1671,8 +1733,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1695,7 +1757,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 @@ -1706,7 +1768,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1717,7 +1779,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1732,7 +1794,7 @@ define amdgpu_kernel void 
@v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1747,7 +1809,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1762,7 +1824,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1784,38 +1846,40 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; 
GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s0 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1833,8 +1897,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 
0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1852,8 +1916,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1871,8 +1935,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1896,20 +1960,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i3 ; GFX10-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 
op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1922,8 +1986,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i3 ; GFX12-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1941,36 +2005,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: 
s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -1985,8 +2049,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2001,8 +2065,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; 
GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2017,8 +2081,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2038,20 +2102,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, fl ; GFX10-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2064,8 +2128,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, fl ; GFX12-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2083,36 +2147,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2127,8 +2191,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2143,8 +2207,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; 
GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2159,8 +2223,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2180,20 +2244,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i3 ; GFX10-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2206,8 +2270,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i3 ; GFX12-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2225,36 +2289,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: 
s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2269,8 +2333,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2285,8 +2349,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2301,8 +2365,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2322,20 +2386,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, fl ; GFX10-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2348,8 +2412,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, fl ; GFX12-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2367,36 +2431,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; 
GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 
s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2411,8 +2475,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2427,8 +2491,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2443,8 +2507,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2464,20 +2528,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2490,8 +2554,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2509,36 +2573,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, 
v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2553,8 +2617,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2569,8 +2633,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2585,8 +2649,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2606,20 +2670,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; 
GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2632,8 +2696,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2651,36 +2715,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: 
s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2695,8 +2759,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2711,8 +2775,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: 
s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2727,8 +2791,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2748,20 +2812,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vss_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) @@ -2774,8 +2838,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 ; GFX12-LABEL: v_permlanex16_b32_vss_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2793,20 +2857,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vss_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2819,8 +2883,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, floa ; GFX12-LABEL: v_permlanex16_b32_vss_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2838,36 +2902,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 
0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2882,8 +2946,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2898,8 +2962,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2914,8 +2978,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2935,36 +2999,36 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2979,8 +3043,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2995,8 +3059,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -3011,8 +3075,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3032,22 +3096,22 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vii_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; 
GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vii_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -3057,7 +3121,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 ; ; GFX12-LABEL: v_permlanex16_b32_vii_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3075,22 +3139,22 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vii_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vii_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -3100,7 +3164,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa ; ; GFX12-LABEL: v_permlanex16_b32_vii_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3117,7 +3181,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3129,7 +3193,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3141,7 +3205,7 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3155,7 +3219,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3169,7 +3233,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3183,7 +3247,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3202,7 +3266,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3214,7 +3278,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3226,7 +3290,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3240,7 +3304,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3254,7 +3318,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3268,7 +3332,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX12-GISEL-LABEL: 
v_permlanex16_b32_vii_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3289,25 +3353,25 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vll_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vll_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3316,7 +3380,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; ; GFX12-LABEL: v_permlanex16_b32_vll_i32: ; GFX12: ; %bb.0: -; 
GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -3335,25 +3399,25 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vll_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vll_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3362,7 +3426,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; ; GFX12-LABEL: v_permlanex16_b32_vll_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 
0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -3380,7 +3444,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3393,7 +3457,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3406,7 +3470,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3422,7 +3486,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3438,7 +3502,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; 
GFX12-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3454,7 +3518,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3475,7 +3539,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3488,7 +3552,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3501,7 +3565,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3517,7 +3581,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3533,7 +3597,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3549,7 +3613,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3571,33 +3635,33 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vvv_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3606,17 +3670,17 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3624,7 +3688,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -3642,7 +3706,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -3667,33 +3731,33 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vvv_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3702,17 +3766,17 @@ define amdgpu_kernel 
void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3720,7 +3784,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -3738,7 +3802,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; 
GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -3762,7 +3826,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -3776,7 +3840,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -3790,7 +3854,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; ; GFX11-LABEL: v_permlanex16_b32_vvv_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -3810,7 +3874,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; ; GFX12-LABEL: v_permlanex16_b32_vvv_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -3837,7 +3901,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: 
v_permlanex16_b32_vvv_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -3851,7 +3915,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -3865,7 +3929,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; ; GFX11-LABEL: v_permlanex16_b32_vvv_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -3885,7 +3949,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; ; GFX12-LABEL: v_permlanex16_b32_vvv_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -3912,7 +3976,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -3923,7 +3987,7 
@@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3934,12 +3998,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -3948,12 +4012,11 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, 
s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -3962,12 +4025,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -3976,12 +4039,11 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -3996,7 +4058,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -4007,7 +4069,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -4018,12 +4080,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -4032,12 +4094,11 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -4046,12 +4107,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -4060,12 +4121,11 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -4081,144 +4141,208 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64 ; 
GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s2 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s2 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vvs_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, 
s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vvs_i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v0, 
v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, double %src0, i32 %src2) { +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 
+; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, double %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s2 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], 
s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s2 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vvs_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; 
GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vvs_f64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; 
GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -4228,7 +4352,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr 
addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 @@ -4239,7 +4363,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -4250,7 +4374,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -4265,7 +4389,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4280,7 +4404,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -4295,7 +4419,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4316,7 +4440,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 @@ -4327,7 +4451,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -4338,7 +4462,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -4353,7 +4477,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f32: ; 
GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4368,7 +4492,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -4383,7 +4507,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4405,38 +4529,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; 
GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s0 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4454,8 +4580,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, 
s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4473,8 +4599,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4492,8 +4618,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4517,38 +4643,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s0 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4566,8 +4694,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; 
GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4585,8 +4713,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4604,8 +4732,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4629,20 +4757,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlanex16_b32_vss_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4655,8 +4783,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlanex16_b32_vss_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4674,20 +4802,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, f ; GFX10-LABEL: v_permlanex16_b32_vss_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; 
GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4700,8 +4828,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, f ; GFX12-LABEL: v_permlanex16_b32_vss_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4719,36 +4847,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -4763,8 +4891,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4779,8 +4907,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -4795,8 +4923,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4816,36 +4944,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -4860,8 +4988,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4876,8 +5004,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -4892,8 +5020,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4913,20 +5041,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlanex16_b32_vss_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4939,8 +5067,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlanex16_b32_vss_bc_i32: ; GFX12: ; 
%bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4958,20 +5086,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, f ; GFX10-LABEL: v_permlanex16_b32_vss_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4984,8 +5112,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, f ; GFX12-LABEL: v_permlanex16_b32_vss_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5003,36 +5131,36 @@ define 
amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; 
GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5047,8 +5175,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5063,8 +5191,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5079,8 +5207,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5100,36 +5228,36 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 
s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5144,8 +5272,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5160,8 +5288,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5176,8 +5304,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5197,20 +5325,20 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5223,8 +5351,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out ; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5242,20 +5370,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5268,8 +5396,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out ; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5287,36 +5415,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; 
GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5331,8 +5459,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5347,8 +5475,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5363,8 +5491,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5384,36 +5512,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; 
GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5428,8 +5556,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5444,8 +5572,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5460,8 +5588,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5481,24 +5609,23 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlane16_b32_tid_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_tid_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 
s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -5506,13 +5633,12 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlane16_b32_tid_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5526,24 +5652,23 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlane16_b32_tid_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; 
GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_tid_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -5551,13 +5676,12 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlane16_b32_tid_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5572,41 +5696,40 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX10-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; 
GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, 
v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -5614,15 +5737,14 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -5630,15 +5752,14 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 
0x3ff, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -5646,15 +5767,14 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -5670,12 +5790,12 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -5683,79 +5803,75 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 
0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -5771,24 +5887,23 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_undef_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_undef_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -5796,13 +5911,12 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_undef_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: 
s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5817,24 +5931,23 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_undef_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_undef_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -5842,13 +5955,12 
@@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_undef_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5864,39 +5976,38 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -5904,15 +6015,14 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -5920,15 +6030,14 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -5936,15 +6045,14 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -5961,12 +6069,12 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -5974,79 +6082,75 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; 
GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX12-GISEL: ; 
%bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6063,23 +6167,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -6087,14 +6191,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6102,15 +6205,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) 
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6118,14 +6220,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6133,15 +6234,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: 
v_permlane16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6155,23 +6255,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -6179,14 +6279,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 
0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6194,15 +6293,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6210,14 +6308,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; 
GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6225,15 +6322,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6248,43 +6344,43 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-SDAG-NEXT: 
v_permlane16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6292,15 +6388,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6308,15 +6404,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; 
GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6324,15 +6420,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: 
v_permlane16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6348,14 +6444,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -6363,85 +6459,81 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3 ; GFX10-GISEL-NEXT: 
global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6457,24 +6549,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -6482,13 +6573,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6503,24 +6593,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; 
GFX11-LABEL: v_permlane16_b32_i_tid_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -6528,13 +6617,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6550,39 +6638,38 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, 
v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6590,15 +6677,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6606,15 +6692,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: 
v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6622,15 +6707,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6647,12 +6731,12 @@ define amdgpu_kernel void 
@v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -6660,79 +6744,75 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; 
GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, 
v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: 
v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6749,24 +6829,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm 
@@ -6774,13 +6853,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6795,24 +6873,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -6820,13 +6897,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6842,39 +6918,38 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 
v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6882,15 +6957,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; 
GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6898,15 +6972,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, 
v2, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6914,15 +6987,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6939,12 +7011,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 
s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -6952,79 +7024,75 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7041,24 +7109,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7066,13 +7133,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7087,24 +7153,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7112,13 +7177,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: ; GFX12: ; %bb.0: 
; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7134,39 +7198,38 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -7174,15 +7237,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -7190,15 +7252,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -7206,15 +7267,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; 
GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7231,12 +7291,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -7244,79 +7304,75 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX10-GISEL: ; %bb.0: ; 
GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: 
v_permlane16_b32_i_tid_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: 
global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7333,24 +7389,23 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_tid_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_tid_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7358,13 +7413,12 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_tid_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7378,24 +7432,23 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, ; 
GFX10-LABEL: v_permlanex16_b32_tid_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_tid_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7403,13 +7456,12 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_tid_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; 
GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7424,41 +7476,40 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -7466,15 +7517,14 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], 
s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -7482,15 +7532,14 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -7498,15 +7547,14 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7522,12 +7570,12 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -7535,79 +7583,75 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, 
s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: 
s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7623,24 +7667,23 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_undef_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_undef_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7648,13 +7691,12 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out ; GFX12-LABEL: v_permlanex16_b32_undef_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7669,24 +7711,23 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_undef_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_undef_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 
0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7694,13 +7735,12 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out ; GFX12-LABEL: v_permlanex16_b32_undef_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7716,39 +7756,38 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX10-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, 
s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -7756,15 +7795,14 @@ define amdgpu_kernel void 
@v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -7772,15 +7810,14 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -7788,15 +7825,14 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7813,12 +7849,12 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: 
v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -7826,79 +7862,75 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; 
GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: 
v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7915,23 +7947,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -7939,14 +7971,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -7954,15 +7985,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -7970,14 +8000,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -7985,15 +8014,14 @@ 
define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8007,23 +8035,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -8031,14 +8059,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -8046,15 +8073,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: 
v_permlanex16_b32 v1, v0, s2, s3 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -8062,14 +8088,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -8077,15 +8102,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8100,43 +8124,43 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX11-SDAG: ; %bb.0: ; 
GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -8144,15 +8168,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; 
GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -8160,15 +8184,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -8176,15 +8200,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; 
GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8200,14 +8224,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -8215,85 +8239,81 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: 
s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: ; 
GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v0, 
s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8309,24 +8329,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8334,13 +8353,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: 
s_endpgm @@ -8355,24 +8373,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8380,13 +8397,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8402,39 +8418,38 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: 
v_permlanex16_b32_i_tid_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -8442,15 +8457,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -8458,15 +8472,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -8474,15 +8487,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8499,12 +8511,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -8512,79 +8524,75 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8601,24 +8609,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_i32: ; 
GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8626,13 +8633,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8647,24 +8653,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8672,13 +8677,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8694,39 +8698,38 @@ define amdgpu_kernel void 
@v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; 
GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -8734,15 +8737,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -8750,15 +8752,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: 
v_permlanex16_b32_i_tid_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -8766,15 +8767,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8791,12 +8791,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -8804,79 +8804,75 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm 
; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; 
GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8893,24 +8889,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8918,13 +8913,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8939,24 +8933,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8964,13 +8957,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8986,39 +8978,38 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -9026,15 +9017,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -9042,15 +9032,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -9058,15 +9047,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -9083,12 +9071,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX10-SDAG: ; 
%bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -9096,79 +9084,75 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; 
GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; 
GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 
op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll index a65143255bbb4e..973678291e2632 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -41,7 +41,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; 
GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -54,7 +54,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -72,7 +72,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -85,7 +85,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -103,8 +103,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b96 
s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -116,9 +115,9 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -135,7 +134,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -148,7 +147,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -166,7 +165,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -179,7 +178,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -197,7 +196,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -210,7 +209,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -228,7 +227,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 
0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -241,7 +240,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -259,7 +258,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -272,7 +271,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -290,7 +289,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: 
v_dual_mov_b32 v1, s2 @@ -303,7 +302,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -321,8 +320,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -334,9 +332,9 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -353,7 +351,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -366,7 +364,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -384,7 +382,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -397,7 +395,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -415,7 +413,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -428,7 +426,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -447,11 +445,10 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlane16var_b32_tid_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -462,10 +459,10 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlane16var_b32_tid_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ 
-483,11 +480,10 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16var_b32_undef_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -498,10 +494,10 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16var_b32_undef_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -520,11 +516,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 
v1, 0x3039 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] @@ -535,12 +531,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -558,11 +552,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -573,10 +566,10 @@ define amdgpu_kernel void 
@v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -595,11 +588,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -610,10 +602,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; 
GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -632,11 +624,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -647,10 +638,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -669,11 +660,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlanex16var_b32_tid_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -684,10 +674,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlanex16var_b32_tid_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -705,11 +695,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_undef_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ 
-720,10 +709,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_undef_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -742,11 +731,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] @@ -757,12 +746,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: 
s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -780,11 +767,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -795,10 +781,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -817,11 +803,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; 
GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -832,10 +817,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -854,11 +839,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -869,10 +853,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll index abb2f877781879..f653baa7365c71 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -5,29 +5,114 @@ declare i32 @llvm.amdgcn.permlane64(i32) declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) { -; GFX11-LABEL: test_s: +define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX11-LABEL: test_s_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: 
global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane64(i32 %src0) + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %src0) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { -; GFX11-LABEL: test_i: +define amdgpu_kernel void @test_s_f32(ptr addrspace(1) %out, float %src0) { +; GFX11-LABEL: test_s_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane64_b32 v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlane64.f32(float %src0) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX11-SDAG-LABEL: test_s_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_s_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %src0) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) { +; GFX11-SDAG-LABEL: test_s_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_s_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane64.f64(double %src0) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_i32(ptr addrspace(1) %out) 
{ +; GFX11-LABEL: test_i_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 @@ -36,16 +121,16 @@ define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane64(i32 99) + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 99) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { -; GFX11-LABEL: test_v: +define amdgpu_kernel void @test_i_f32(ptr addrspace(1) %out) { +; GFX11-LABEL: test_i_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -53,11 +138,314 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlane64.f32(float 1234.5) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_i64(ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: test_i_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; 
GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 99) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_f64(ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: test_i_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x40934a00 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x40934a00 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX11-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane64.f64(double 1234.5) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v_i32(ptr addrspace(1) %out, i32 %src0) #1 { +; GFX11-SDAG-LABEL: test_v_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane64(i32 %tidx) + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %tidx) store i32 %v, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX11-GISEL: {{.*}} -; GFX11-SDAG: {{.*}} + +define amdgpu_kernel void @test_v_f32(ptr addrspace(1) %out, float %src0) #1 { +; GFX11-SDAG-LABEL: test_v_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %v = call float @llvm.amdgcn.permlane64.f32(float %tidx_f32) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v_i64(ptr addrspace(1) %out, i64 %src0) #1 { +; GFX11-LABEL: test_v_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_permlane64_b32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlane64_b32 v1, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %tidx_i64) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v_f64(ptr addrspace(1) %out, double %src0) #1 { +; GFX11-SDAG-LABEL: 
test_v_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %v = call double @llvm.amdgcn.permlane64.f64(double %tidx_f64) + store double %v, ptr addrspace(1) %out + ret void +} + +define void @test_half(ptr addrspace(1) %out, half %src0) { +; GFX11-LABEL: test_half: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call half @llvm.amdgcn.permlane64.f16(half %src0) + store half %v, ptr addrspace(1) %out + ret void +} + +define void @test_bfloat(ptr addrspace(1) %out, bfloat %src0) { +; GFX11-LABEL: test_bfloat: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call bfloat @llvm.amdgcn.permlane64.bf16(bfloat %src0) + store bfloat %v, ptr addrspace(1) %out + ret void +} + +define void @test_i16(ptr addrspace(1) %out, i16 %src0) { +; GFX11-LABEL: test_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane64.i16(i16 %src0) + store i16 %v, ptr addrspace(1) %out + ret void +} + +define void @test_v2f16(ptr addrspace(1) %out, <2 x half> %src0) { +; GFX11-LABEL: test_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane64.v2f16(<2 x half> %src0) + store <2 x half> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v2f32(ptr addrspace(1) %out, <2 x float> %src0) { +; GFX11-SDAG-LABEL: test_v2f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v2f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane64.v2f32(<2 x float> %src0) + store <2 x float> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v7i32(ptr addrspace(1) %out, <7 x i32> %src0) { +; GFX11-SDAG-LABEL: test_v7i32: +; GFX11-SDAG: ; %bb.0: 
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8 +; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 +; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v7i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 +; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 +; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane64.v7i32(<7 x i32> %src0) + store <7 x i32> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v8i16(ptr addrspace(1) %out, <8 x i16> %src0) { +; GFX11-SDAG-LABEL: test_v8i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v8i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: 
v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane64.v8i16(<8 x i16> %src0) + store <8 x i16> %v, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll index afa3fe8c2f1fbd..2070a832e0fcd0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) { ; GFX11-SDAG-LABEL: test_p0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -24,13 +24,13 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) { ; GFX11-SDAG-LABEL: test_v3p0: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x2 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x44 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x54 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s6 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s2 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 
; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v1 @@ -40,8 +40,8 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) { ; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v0 ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v7 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[2:3] offset:16 -; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[2:3] +; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -54,10 +54,10 @@ define amdgpu_kernel void @test_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0 ; GFX11-SDAG-LABEL: test_p3: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] @@ -73,8 +73,8 @@ define amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3 ; GFX11-SDAG-LABEL: test_v3p3: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 @@ -97,10 +97,10 @@ define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0 ; GFX11-SDAG-LABEL: test_p5: ; 
GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] @@ -116,8 +116,8 @@ define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5 ; GFX11-SDAG-LABEL: test_v3p5: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 @@ -140,10 +140,10 @@ define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0 ; GFX11-SDAG-LABEL: test_p6: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] @@ -159,8 +159,8 @@ define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6 ; GFX11-SDAG-LABEL: test_v3p6: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 
0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll index 7e16358f741819..36d23197887136 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll @@ -4,7 +4,7 @@ ; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target ; GCN-LABEL: {{^}}test: -; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN: .amdhsa_user_sgpr_queue_ptr 1 define amdgpu_kernel void @test(ptr addrspace(1) %out) { %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 @@ -13,21 +13,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out) { ret void } -; FIXME: Should really be able to delete the load -; GCN-LABEL: {{^}}test_ub: -; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0 -; GCN: .amdhsa_user_sgpr_queue_ptr 0 -define amdgpu_kernel void @test_ub(ptr addrspace(1) %out) #1 { - %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 - %value = load i32, ptr addrspace(4) %queue_ptr - store i32 %value, ptr addrspace(1) %out - ret void -} - declare noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 attributes #0 = { nounwind readnone } -attributes #1 = { "amdgpu-no-queue-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll index 9f0b420a0a828d..5d9daae69e7865 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -5,11 +5,7 @@ define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_ ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen offset:24 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 24 @@ -21,11 +17,7 @@ define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sg ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 +; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[4:7], s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) @@ -36,11 +28,7 @@ define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffse ; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], s8 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: 
s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -51,11 +39,7 @@ define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__ ; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[4:7], s8 offset:92 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) @@ -66,11 +50,7 @@ define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_ ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen slc +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen slc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll index 320b0b4508b6a5..9becefa33a8f24 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll @@ -10,7 
+10,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s6 offen offset:128 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen offset:128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 128 @@ -26,7 +26,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2) @@ -41,7 +41,7 @@ define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s6 offen offset:128 +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen offset:128 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 128 %unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -56,7 +56,7 @@ define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, 
s[0:3], s4 offset:92 th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll index ce46e2755ae582..9ac6b6a1d0ff9d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll @@ -8,29 +8,21 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen scc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen sc1 +; GFX940-NEXT: 
buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -41,7 +33,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen scope:SCOPE_SYS +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24) ret void @@ -51,29 +43,21 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset ; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 +; GFX908-NEXT: buffer_atomic_add_f32 v0, off, s[4:7], s8 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[4:7], s8 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 +; GFX940-NEXT: 
buffer_atomic_add_f32 v0, off, s[0:3], s4 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -84,7 +68,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 +; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void @@ -94,29 +78,21 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX908-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], s8 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], s8 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen +; GFX940-NEXT: 
buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -127,7 +103,7 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -137,29 +113,21 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffs ; GFX908-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, off, s[4:7], s8 offset:92 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[4:7], s8 offset:92 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: 
buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -170,7 +138,7 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0) ret void @@ -180,29 +148,21 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen slc +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen slc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen slc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX940: ; %bb.0: ; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -213,7 +173,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll index 327d80a7b67cdc..fc4449886d9541 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll @@ -7,18 +7,14 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen glc scc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 
offen sc0 sc1 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -29,7 +25,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24) @@ -40,18 +36,14 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset_ ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 glc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[4:7], s8 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s4 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -62,7 +54,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 th:TH_ATOMIC_RETURN 
+; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) @@ -73,18 +65,14 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgp ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen glc +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], s8 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen sc0 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -95,7 +83,7 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgp ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -106,18 +94,14 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_v ; GFX90A-LABEL: 
raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 glc +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[4:7], s8 offset:92 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 sc0 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -128,7 +112,7 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0) @@ -139,18 +123,14 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen glc slc +; 
GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen sc0 nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen sc0 nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -161,7 +141,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll index 3ecbe3c71d0222..3c800d0369e70c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll @@ -9,11 +9,7 @@ define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -21,33 +17,21 @@ define 
bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s11, s17 -; GFX9-NEXT: s_mov_b32 s10, s16 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -65,11 +49,7 @@ define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -78,33 +58,21 @@ define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: 
raw_ptr_buffer_load_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s11, s17 -; GFX9-NEXT: s_mov_b32 s10, s16 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -122,11 +90,7 @@ define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 @@ -137,33 +101,21 @@ define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s11, s17 -; GFX9-NEXT: s_mov_b32 s10, s16 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -187,11 +139,7 @@ define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 @@ -206,33 +154,21 @@ define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: 
s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s11, s17 -; GFX9-NEXT: s_mov_b32 s10, s16 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll index cc1547eaad8309..4d557c76dc4d07 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -1180,22 +1180,14 @@ define double @buffer_load_f64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %v ; PREGFX10-LABEL: buffer_load_f64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; 
PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_f64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1214,22 +1206,14 @@ define <2 x double> @buffer_load_v2f64__voffset_add(ptr addrspace(8) inreg %rsrc ; PREGFX10-LABEL: buffer_load_v2f64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2f64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1248,22 +1232,14 @@ define i64 @buffer_load_i64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voff ; PREGFX10-LABEL: buffer_load_i64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; 
PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_i64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1282,22 +1258,14 @@ define <2 x i64> @buffer_load_v2i64__voffset_add(ptr addrspace(8) inreg %rsrc, i ; PREGFX10-LABEL: buffer_load_v2i64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2i64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1316,22 +1284,14 @@ define ptr @buffer_load_p0__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffs ; PREGFX10-LABEL: buffer_load_p0__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p0__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1350,22 +1310,14 @@ define <2 x ptr> @buffer_load_v2p0__voffset_add(ptr addrspace(8) inreg %rsrc, i3 ; PREGFX10-LABEL: buffer_load_v2p0__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p0__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1384,22 +1336,14 @@ define ptr addrspace(1) @buffer_load_p1__voffset_add(ptr 
addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p1__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p1__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1418,22 +1362,14 @@ define <2 x ptr addrspace(1)> @buffer_load_v2p1__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p1__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p1__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1452,22 +1388,14 @@ define ptr addrspace(4) @buffer_load_p4__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p4__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p4__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1486,22 +1414,14 @@ define <2 x ptr addrspace(4)> @buffer_load_v2p4__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p4__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p4__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; 
GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1520,22 +1440,14 @@ define ptr addrspace(999) @buffer_load_p999__voffset_add(ptr addrspace(8) inreg ; PREGFX10-LABEL: buffer_load_p999__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p999__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1554,22 +1466,14 @@ define <2 x ptr addrspace(999)> @buffer_load_v2p999__voffset_add(ptr addrspace(8 ; PREGFX10-LABEL: buffer_load_v2p999__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p999__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1588,22 +1492,14 @@ define ptr addrspace(2) @buffer_load_p2__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p2__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1622,22 +1518,14 @@ define <2 x ptr addrspace(2)> @buffer_load_v2p2__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p2__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1656,11 +1544,7 @@ define <3 x ptr addrspace(2)> @buffer_load_v3p2__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1679,22 +1563,14 @@ define <4 x ptr addrspace(2)> @buffer_load_v4p2__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p2__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; 
GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1713,22 +1589,14 @@ define ptr addrspace(3) @buffer_load_p3__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p3__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1747,22 +1615,14 @@ define <2 x ptr addrspace(3)> @buffer_load_v2p3__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p3__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 
s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1781,11 +1641,7 @@ define <3 x ptr addrspace(3)> @buffer_load_v3p3__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1804,22 +1660,14 @@ define <4 x ptr addrspace(3)> @buffer_load_v4p3__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p3__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1838,22 +1686,14 @@ define ptr addrspace(5) @buffer_load_p5__voffset_add(ptr addrspace(8) inreg 
%rsr ; PREGFX10-LABEL: buffer_load_p5__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,22 +1712,14 @@ define <2 x ptr addrspace(5)> @buffer_load_v2p5__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p5__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
s_setpc_b64 s[30:31] ; @@ -1906,11 +1738,7 @@ define <3 x ptr addrspace(5)> @buffer_load_v3p5__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1929,22 +1757,14 @@ define <4 x ptr addrspace(5)> @buffer_load_v4p5__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p5__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1963,22 +1783,14 @@ define ptr addrspace(6) @buffer_load_p6__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p6__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 
s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1997,22 +1809,14 @@ define <2 x ptr addrspace(6)> @buffer_load_v2p6__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p6__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2031,11 +1835,7 @@ define <3 x ptr addrspace(6)> @buffer_load_v3p6__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2054,22 +1854,14 @@ define <4 x ptr addrspace(6)> @buffer_load_v4p6__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p6__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll index d9227724c22a14..4fbb4ec342ff50 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -352,22 +352,14 @@ define void @buffer_store_f64__voffset_add(ptr addrspace(8) inreg %rsrc, double ; VERDE-LABEL: buffer_store_f64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, 
s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_f64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -379,22 +371,14 @@ define void @buffer_store_v2f64__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x ; VERDE-LABEL: buffer_store_v2f64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2f64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -406,22 +390,14 @@ define void @buffer_store_i64__voffset_add(ptr 
addrspace(8) inreg %rsrc, i64 %da ; VERDE-LABEL: buffer_store_i64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_i64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -433,22 +409,14 @@ define void @buffer_store_v2i64__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x ; VERDE-LABEL: buffer_store_v2i64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2i64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 
; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -460,22 +428,14 @@ define void @buffer_store_p0__voffset_add(ptr addrspace(8) inreg %rsrc, ptr %dat ; VERDE-LABEL: buffer_store_p0__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p0__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -487,22 +447,14 @@ define void @buffer_store_v2p0__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p0__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p0__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; 
CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -514,22 +466,14 @@ define void @buffer_store_p1__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p1__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p1__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -541,22 +485,14 @@ define void @buffer_store_v2p1__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p1__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p1__voffset_add: ; 
CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -568,22 +504,14 @@ define void @buffer_store_p4__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p4__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p4__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -595,22 +523,14 @@ define void @buffer_store_v2p4__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p4__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: 
buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p4__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -622,22 +542,14 @@ define void @buffer_store_p999__voffset_add(ptr addrspace(8) inreg %rsrc, ptr ad ; VERDE-LABEL: buffer_store_p999__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p999__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -649,22 +561,14 @@ define void @buffer_store_v2p999__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x ; VERDE-LABEL: buffer_store_v2p999__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; 
VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p999__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -676,22 +580,14 @@ define void @buffer_store_p2__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -703,22 +599,14 @@ define void @buffer_store_v2p2__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; 
VERDE-LABEL: buffer_store_v2p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -730,22 +618,14 @@ define void @buffer_store_v3p2__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; 
CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -757,22 +637,14 @@ define void @buffer_store_v4p2__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -784,22 +656,14 @@ define void @buffer_store_p3__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: 
buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -811,22 +675,14 @@ define void @buffer_store_v2p3__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -838,22 +694,14 @@ define void @buffer_store_v3p3__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -865,22 +713,14 @@ define void @buffer_store_v4p3__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -892,22 +732,14 @@ define void @buffer_store_p5__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; VERDE-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -919,22 +751,14 @@ define void @buffer_store_v2p5__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -946,22 +770,14 @@ define void @buffer_store_v3p5__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; 
VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -973,22 +789,14 @@ define void @buffer_store_v4p5__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1000,22 +808,14 @@ define void @buffer_store_p6__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1027,22 +827,14 @@ define void @buffer_store_v2p6__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1054,22 +846,14 @@ define 
void @buffer_store_v3p6__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1081,22 +865,14 @@ define void @buffer_store_v4p6__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: 
buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll index 30f04f1ff220cb..cb511c93f67ed5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] @@ -17,28 +17,28 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; 
GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] @@ -53,8 +53,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff @@ -65,28 +65,28 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x hal ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt 
lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] @@ -101,8 +101,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 @@ -115,32 +115,32 @@ define 
amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; PREGFX10-PACKED-NEXT: s_and_b32 s0, s3, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s0 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PACKED-NEXT: s_and_b32 s0, s3, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 @@ -158,8 +158,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff @@ -174,30 +174,30 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s3 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 
v0, s0 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll index a241bdeaff1a75..01df7634f0e9c7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] @@ -19,28 +19,28 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
PREGFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] @@ -51,8 +51,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; GFX12-PACKED-LABEL: tbuffer_store_d16_x: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] @@ -67,8 +67,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff @@ -79,28 +79,28 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; 
GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] @@ -111,8 +111,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; GFX12-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] @@ -127,8 +127,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 @@ -141,32 +141,32 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; 
PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; PREGFX10-PACKED-NEXT: s_and_b32 s0, s3, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s0 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PACKED-NEXT: s_and_b32 s0, s3, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 @@ -179,8 +179,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, 
<4 x half> %da ; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-SDAG: ; %bb.0: ; %main_body ; GFX12-PACKED-SDAG-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-PACKED-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-SDAG-NEXT: s_and_b32 s5, s5, 0xffff ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4 @@ -193,8 +193,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body ; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 -; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s4, s4, s4 ; GFX12-PACKED-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -213,8 +213,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff @@ -229,30 +229,30 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s3 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -264,8 +264,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: 
s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index b061d53de5d3c5..cc6c630ae6466d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -151,7 +151,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -161,7 +161,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -176,7 +176,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) @@ -187,7 +187,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -204,7 +204,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -215,7 +215,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -233,7 +233,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_m0: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -246,7 +246,7 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { ; ; CHECK-GISEL-LABEL: test_readfirstlane_m0: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -265,7 +265,7 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -278,7 +278,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -297,7 +297,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -311,7 +311,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -331,7 +331,7 @@ define amdgpu_kernel void 
@test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -345,7 +345,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -365,7 +365,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_fi: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s15 +; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s9 ; CHECK-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-SDAG-NEXT: s_mov_b32 s4, 0 ; CHECK-SDAG-NEXT: ;;#ASMSTART @@ -375,7 +375,7 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 { ; ; CHECK-GISEL-LABEL: test_readfirstlane_fi: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_add_u32 s0, s0, s15 +; CHECK-GISEL-NEXT: s_add_u32 s0, s0, s9 ; CHECK-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-GISEL-NEXT: s_mov_b32 s4, 0 ; CHECK-GISEL-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 24a332fa211c15..66e1f9396de5af 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -9,7 +9,7 @@ declare double @llvm.amdgcn.readlane.f64(double, i32) #0 define amdgpu_kernel void 
@test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s0 @@ -18,7 +18,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s0 @@ -32,7 +32,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s[0:1] @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s[0:1] @@ -55,7 +55,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s[0:1] @@ -64,7 +64,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s[0:1] @@ -78,7 +78,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[6:7], 0x4 +; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x4 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -91,7 +91,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s0, s[6:7], 0x4 +; CHECK-GISEL-NEXT: s_load_dword s0, s[4:5], 0x4 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -110,7 +110,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[6:7], 0x8 +; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -124,7 +124,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s1, s[6:7], 0x8 +; CHECK-GISEL-NEXT: s_load_dword s1, s[4:5], 0x8 ; CHECK-GISEL-NEXT: 
;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -144,7 +144,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[6:7], 0x8 +; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -158,7 +158,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s1, s[6:7], 0x8 +; CHECK-GISEL-NEXT: s_load_dword s1, s[4:5], 0x8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -178,7 +178,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -188,7 +188,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 ; ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 { ; 
CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -214,7 +214,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -231,7 +231,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -242,7 +242,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -260,7 +260,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; CHECK-SDAG-LABEL: test_readlane_vregs_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -279,7 +279,7 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; ; CHECK-GISEL-LABEL: test_readlane_vregs_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -309,7 +309,7 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; CHECK-SDAG-LABEL: test_readlane_vregs_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -330,7 +330,7 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; ; CHECK-GISEL-LABEL: test_readlane_vregs_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -363,7 +363,7 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; CHECK-SDAG-LABEL: test_readlane_vregs_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -384,7 +384,7 @@ 
define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; ; CHECK-GISEL-LABEL: test_readlane_vregs_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -418,7 +418,7 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_m0_sreg: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -431,7 +431,7 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; ; CHECK-GISEL-LABEL: test_readlane_m0_sreg: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -450,7 +450,7 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -464,7 +464,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: 
;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -484,7 +484,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -500,7 +500,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -522,7 +522,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -538,7 +538,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -560,7 +560,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32: ; 
CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -573,7 +573,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -592,7 +592,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -606,7 +606,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -626,7 +626,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -640,7 +640,7 @@ define 
amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index e2f494283a3f2e..f52461b6b38075 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -18,21 +18,21 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8DAGISEL-NEXT: s_endpgm ; ; GFX8GISEL-LABEL: uniform_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -40,54 +40,54 @@ define amdgpu_kernel 
void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9DAGISEL-LABEL: uniform_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9DAGISEL-NEXT: s_endpgm ; ; GFX9GISEL-LABEL: uniform_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9GISEL-NEXT: s_endpgm ; ; GFX10DAGISEL-LABEL: uniform_value: ; GFX10DAGISEL: ; %bb.0: ; %entry ; GFX10DAGISEL-NEXT: s_clause 0x1 -; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10DAGISEL-NEXT: s_endpgm ; ; GFX10GISEL-LABEL: uniform_value: ; GFX10GISEL: ; %bb.0: ; %entry ; GFX10GISEL-NEXT: s_clause 0x1 -; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10GISEL-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: uniform_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry ; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1164DAGISEL-NEXT: s_nop 0 ; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -96,11 +96,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-LABEL: uniform_value: ; GFX1164GISEL: ; %bb.0: ; %entry ; GFX1164GISEL-NEXT: s_clause 0x1 -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1164GISEL-NEXT: s_nop 0 ; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -109,10 +109,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-LABEL: uniform_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: 
s_load_b32 s2, s[0:1], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1132DAGISEL-NEXT: s_nop 0 ; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -121,10 +121,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-LABEL: uniform_value: ; GFX1132GISEL: ; %bb.0: ; %entry ; GFX1132GISEL-NEXT: s_clause 0x1 -; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1132GISEL-NEXT: s_nop 0 ; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -138,7 +138,7 @@ entry: define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: const_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -148,7 +148,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX8GISEL-LABEL: const_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -158,7 +158,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) 
{ ; ; GFX9DAGISEL-LABEL: const_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -167,7 +167,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9GISEL-LABEL: const_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -176,7 +176,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10DAGISEL-LABEL: const_value: ; GFX10DAGISEL: ; %bb.0: ; %entry -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -185,7 +185,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10GISEL-LABEL: const_value: ; GFX10GISEL: ; %bb.0: ; %entry -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -194,7 +194,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164DAGISEL-LABEL: const_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -205,7 +205,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164GISEL-LABEL: const_value: ; 
GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -216,7 +216,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132DAGISEL-LABEL: const_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -226,7 +226,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132GISEL-LABEL: const_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -280,7 +280,7 @@ entry: define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -300,7 +300,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX8GISEL-LABEL: divergent_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s4, 0 ; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -320,7 +320,7 
@@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9DAGISEL-LABEL: divergent_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 @@ -339,7 +339,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9GISEL-LABEL: divergent_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s4, 0 ; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -358,7 +358,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1064DAGISEL-LABEL: divergent_value: ; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 @@ -377,7 +377,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1064GISEL-LABEL: divergent_value: ; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -396,7 +396,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1032DAGISEL-LABEL: divergent_value: ; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 @@ -415,7 +415,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1032GISEL-LABEL: divergent_value: ; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -434,17 +434,15 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164DAGISEL-LABEL: divergent_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -458,16 +456,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164GISEL-LABEL: divergent_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 ; 
GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -482,16 +478,15 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132DAGISEL-LABEL: divergent_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 ; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -505,16 +500,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132GISEL-LABEL: divergent_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132GISEL-NEXT: s_mov_b32 s3, 
exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -537,17 +530,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -562,8 +555,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -574,16 +567,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL: ; %bb.0: ; %entry ; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s6, s4 ; GFX8GISEL-NEXT: .LBB4_2: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -596,8 +589,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8GISEL-NEXT: .LBB4_5: ; %endif -; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -609,17 +602,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL: ; %bb.0: ; %entry ; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX9DAGISEL-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -634,8 +627,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -645,16 +638,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL: ; %bb.0: ; %entry ; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 
0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s6, s4 ; GFX9GISEL-NEXT: .LBB4_2: ; %Flow -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -667,8 +660,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9GISEL-NEXT: .LBB4_5: ; %endif -; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -679,17 +672,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL: ; %bb.0: ; %entry ; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX1064DAGISEL-NEXT: s_cbranch_execz 
.LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -704,8 +697,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -715,16 +708,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL: ; %bb.0: ; %entry ; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 ; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -737,8 +730,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064GISEL-NEXT: .LBB4_5: ; %endif -; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064GISEL-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -748,34 +741,34 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-LABEL: divergent_cfg: ; GFX1032DAGISEL: ; %bb.0: ; %entry ; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, 0 ; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1032DAGISEL-NEXT: s_max_u32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_max_u32 s3, s3, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032DAGISEL-NEXT: 
s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -784,54 +777,52 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-LABEL: divergent_cfg: ; GFX1032GISEL: ; %bb.0: ; %entry ; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 -; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: s_mov_b32 s2, s2 ; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s3, s3 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1032GISEL-NEXT: s_max_u32 s0, s0, s6 
+; GFX1032GISEL-NEXT: s_max_u32 s2, s2, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032GISEL-NEXT: .LBB4_5: ; %endif -; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1032GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: divergent_cfg: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -847,8 +838,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; 
GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -858,20 +849,18 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164GISEL-LABEL: divergent_cfg: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 ; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -885,8 +874,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164GISEL-NEXT: .LBB4_5: ; %endif -; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, 
s[2:3] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -897,38 +886,36 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132DAGISEL-LABEL: divergent_cfg: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b32 s3, s[0:1], 0x2c ; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, 0 ; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1132DAGISEL-NEXT: s_max_u32 s1, s1, s6 +; 
GFX1132DAGISEL-NEXT: s_max_u32 s3, s3, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -938,38 +925,36 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132GISEL-LABEL: divergent_cfg: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: s_mov_b32 s2, s2 ; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s3, s3 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: 
Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1132GISEL-NEXT: s_max_u32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132GISEL-NEXT: .LBB4_5: ; %endif -; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1132GISEL-NEXT: s_nop 0 ; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index 5304188e02f84a..bfdb2da6dc6a41 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -19,21 +19,21 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX8DAGISEL-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8DAGISEL-NEXT: s_endpgm ; ; GFX8GISEL-LABEL: uniform_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -41,54 +41,54 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9DAGISEL-LABEL: uniform_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9DAGISEL-NEXT: s_endpgm ; ; GFX9GISEL-LABEL: uniform_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9GISEL-NEXT: s_endpgm ; ; GFX10DAGISEL-LABEL: uniform_value: ; GFX10DAGISEL: ; %bb.0: ; %entry ; GFX10DAGISEL-NEXT: s_clause 0x1 -; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; 
GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10DAGISEL-NEXT: s_endpgm ; ; GFX10GISEL-LABEL: uniform_value: ; GFX10GISEL: ; %bb.0: ; %entry ; GFX10GISEL-NEXT: s_clause 0x1 -; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: uniform_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry ; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1164DAGISEL-NEXT: s_nop 0 ; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -97,11 +97,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-LABEL: uniform_value: ; GFX1164GISEL: ; %bb.0: ; %entry ; GFX1164GISEL-NEXT: s_clause 0x1 -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b32 s2, 
s[0:1], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1164GISEL-NEXT: s_nop 0 ; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -110,10 +110,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-LABEL: uniform_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1132DAGISEL-NEXT: s_nop 0 ; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -122,10 +122,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-LABEL: uniform_value: ; GFX1132GISEL: ; %bb.0: ; %entry ; GFX1132GISEL-NEXT: s_clause 0x1 -; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1132GISEL-NEXT: s_nop 0 ; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -139,7 +139,7 @@ entry: define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: const_value: ; GFX8DAGISEL: ; 
%bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -149,7 +149,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX8GISEL-LABEL: const_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -159,7 +159,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9DAGISEL-LABEL: const_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -168,7 +168,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9GISEL-LABEL: const_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -177,7 +177,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10DAGISEL-LABEL: const_value: ; GFX10DAGISEL: ; %bb.0: ; %entry -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10GISEL-LABEL: const_value: ; GFX10GISEL: ; %bb.0: ; %entry -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -195,7 +195,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164DAGISEL-LABEL: const_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -206,7 +206,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164GISEL-LABEL: const_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -217,7 +217,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132DAGISEL-LABEL: const_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -227,7 +227,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132GISEL-LABEL: const_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -281,7 +281,7 @@ entry: define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: divergent_value: ; GFX8DAGISEL: 
; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -301,7 +301,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX8GISEL-LABEL: divergent_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s4, -1 ; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -321,7 +321,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX9DAGISEL-LABEL: divergent_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1 @@ -340,7 +340,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX9GISEL-LABEL: divergent_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s4, -1 ; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -359,7 +359,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1064DAGISEL-LABEL: divergent_value: ; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1 @@ -378,7 +378,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; 
GFX1064GISEL-LABEL: divergent_value: ; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s4, -1 ; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -397,7 +397,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1032DAGISEL-LABEL: divergent_value: ; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, -1 @@ -416,7 +416,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1032GISEL-LABEL: divergent_value: ; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, -1 ; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -435,17 +435,15 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1164DAGISEL-LABEL: divergent_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; 
GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -459,16 +457,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1164GISEL-LABEL: divergent_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s4, -1 ; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -483,16 +479,15 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1132DAGISEL-LABEL: divergent_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, -1 ; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; 
GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_min_u32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -506,16 +501,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1132GISEL-LABEL: divergent_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, -1 ; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -538,17 +531,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; 
GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -563,8 +556,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -575,16 +568,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL: ; %bb.0: ; %entry ; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s6, s4 ; GFX8GISEL-NEXT: .LBB4_2: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -597,8 +590,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; 
GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8GISEL-NEXT: .LBB4_5: ; %endif -; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -610,17 +603,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL: ; %bb.0: ; %entry ; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -635,8 +628,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -646,16 +639,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL: ; %bb.0: ; %entry ; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s6, s4 ; GFX9GISEL-NEXT: .LBB4_2: ; %Flow -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -668,8 +661,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9GISEL-NEXT: .LBB4_5: ; %endif -; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -680,17 +673,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL: ; %bb.0: ; %entry ; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, 
s[2:3] ; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -705,8 +698,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -716,16 +709,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL: ; %bb.0: ; %entry ; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 ; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -738,8 +731,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064GISEL-NEXT: .LBB4_5: ; %endif -; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -749,34 +742,34 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-LABEL: divergent_cfg: ; GFX1032DAGISEL: ; %bb.0: ; %entry ; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: s_mov_b32 s1, -1 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, -1 ; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1032DAGISEL-NEXT: s_min_u32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_min_u32 s3, s3, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -785,54 +778,52 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-LABEL: divergent_cfg: ; GFX1032GISEL: ; %bb.0: ; %entry ; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 -; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: 
s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: s_mov_b32 s2, s2 ; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s3, s3 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032GISEL-NEXT: s_mov_b32 s0, -1 +; GFX1032GISEL-NEXT: s_mov_b32 s2, -1 ; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1032GISEL-NEXT: s_min_u32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_min_u32 s2, s2, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032GISEL-NEXT: .LBB4_5: ; %endif -; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1032GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: divergent_cfg: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b32 s4, 
s[0:1], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -848,8 +839,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -859,20 +850,18 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164GISEL-LABEL: divergent_cfg: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 ; GFX1164GISEL-NEXT: 
.LBB4_2: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -886,8 +875,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164GISEL-NEXT: .LBB4_5: ; %endif -; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -898,38 +887,36 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132DAGISEL-LABEL: divergent_cfg: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b32 s3, s[0:1], 0x2c ; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, -1 ; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1132DAGISEL-NEXT: s_min_u32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_min_u32 s3, s3, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -939,38 +926,36 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132GISEL-LABEL: divergent_cfg: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c 
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: s_mov_b32 s2, s2 ; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s3, s3 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s0, -1 +; GFX1132GISEL-NEXT: s_mov_b32 s2, -1 ; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1132GISEL-NEXT: s_min_u32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132GISEL-NEXT: .LBB4_5: ; %endif -; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1132GISEL-NEXT: s_nop 0 ; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index d521a6c25e462e..47c021769aa56f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -10,104 +10,102 @@ define amdgpu_kernel void 
@test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT0-LABEL: test_barrier: ; VARIANT0: ; %bb.0: ; %entry -; VARIANT0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; VARIANT0-NEXT: s_load_dword s4, s[2:3], 0xb -; VARIANT0-NEXT: s_mov_b32 s3, 0xf000 -; VARIANT0-NEXT: s_mov_b32 s2, 0 +; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; VARIANT0-NEXT: s_load_dword s0, s[0:1], 0xb +; VARIANT0-NEXT: s_mov_b32 s7, 0xf000 +; VARIANT0-NEXT: s_mov_b32 s6, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT0-NEXT: v_mov_b32_e32 v2, 0 ; VARIANT0-NEXT: v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VARIANT0-NEXT: s_barrier -; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 -; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[0:3], 0 addr64 +; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) -; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_endpgm ; ; VARIANT1-LABEL: test_barrier: ; VARIANT1: ; %bb.0: ; %entry -; VARIANT1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; VARIANT1-NEXT: s_load_dword s4, s[2:3], 0xb -; VARIANT1-NEXT: s_mov_b32 s3, 0xf000 -; VARIANT1-NEXT: s_mov_b32 s2, 0 +; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; VARIANT1-NEXT: s_load_dword s0, s[0:1], 0xb +; VARIANT1-NEXT: s_mov_b32 s7, 0xf000 +; VARIANT1-NEXT: s_mov_b32 s6, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT1-NEXT: v_mov_b32_e32 v2, 0 ; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; VARIANT1-NEXT: 
buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_barrier -; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT1-NEXT: s_waitcnt expcnt(0) -; VARIANT1-NEXT: buffer_load_dword v0, v[3:4], s[0:3], 0 addr64 +; VARIANT1-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_waitcnt vmcnt(0) -; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_endpgm ; ; VARIANT2-LABEL: test_barrier: ; VARIANT2: ; %bb.0: ; %entry -; VARIANT2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VARIANT2-NEXT: s_load_dword s4, s[2:3], 0x2c +; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VARIANT2-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT2-NEXT: global_store_dword v2, v0, s[0:1] +; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VARIANT2-NEXT: v_mov_b32_e32 v3, s1 -; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; VARIANT2-NEXT: v_mov_b32_e32 v3, s3 +; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; VARIANT2-NEXT: s_waitcnt vmcnt(0) ; VARIANT2-NEXT: s_barrier ; VARIANT2-NEXT: global_load_dword v0, v[0:1], off ; VARIANT2-NEXT: s_waitcnt vmcnt(0) -; VARIANT2-NEXT: global_store_dword v2, v0, s[0:1] +; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT2-NEXT: s_endpgm ; ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry -; VARIANT3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VARIANT3-NEXT: s_load_dword s4, s[2:3], 0x2c +; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VARIANT3-NEXT: s_load_dword s4, s[0:1], 0x2c 
; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT3-NEXT: global_store_dword v2, v0, s[0:1] +; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VARIANT3-NEXT: v_mov_b32_e32 v3, s1 -; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; VARIANT3-NEXT: v_mov_b32_e32 v3, s3 +; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; VARIANT3-NEXT: s_barrier ; VARIANT3-NEXT: global_load_dword v0, v[0:1], off ; VARIANT3-NEXT: s_waitcnt vmcnt(0) -; VARIANT3-NEXT: global_store_dword v2, v0, s[0:1] +; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT3-NEXT: s_endpgm ; ; VARIANT4-LABEL: test_barrier: ; VARIANT4: ; %bb.0: ; %entry -; VARIANT4-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; VARIANT4-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) -; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v2 +; VARIANT4-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; VARIANT4-NEXT: s_wait_kmcnt 0x0 -; VARIANT4-NEXT: v_xad_u32 v0, v2, -1, s2 -; VARIANT4-NEXT: global_store_b32 v3, v2, s[0:1] +; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s2 +; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT4-NEXT: s_wait_storecnt 0x0 ; VARIANT4-NEXT: s_barrier_signal -1 ; VARIANT4-NEXT: s_barrier_wait -1 -; VARIANT4-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VARIANT4-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT4-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] -; VARIANT4-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; VARIANT4-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] +; VARIANT4-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1 ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT4-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; 
VARIANT4-NEXT: global_load_b32 v0, v[0:1], off +; VARIANT4-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo +; VARIANT4-NEXT: global_load_b32 v0, v[1:2], off ; VARIANT4-NEXT: s_wait_loadcnt 0x0 ; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT4-NEXT: s_nop 0 @@ -116,22 +114,20 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT5-LABEL: test_barrier: ; VARIANT5: ; %bb.0: ; %entry -; VARIANT5-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; VARIANT5-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) -; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v2 +; VARIANT5-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; VARIANT5-NEXT: s_wait_kmcnt 0x0 -; VARIANT5-NEXT: v_xad_u32 v0, v2, -1, s2 -; VARIANT5-NEXT: global_store_b32 v3, v2, s[0:1] +; VARIANT5-NEXT: v_xad_u32 v1, v0, -1, s2 +; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT5-NEXT: s_barrier_signal -1 ; VARIANT5-NEXT: s_barrier_wait -1 -; VARIANT5-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VARIANT5-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT5-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] -; VARIANT5-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; VARIANT5-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] +; VARIANT5-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1 ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT5-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; VARIANT5-NEXT: global_load_b32 v0, v[0:1], off +; VARIANT5-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo +; VARIANT5-NEXT: global_load_b32 v0, v[1:2], off ; VARIANT5-NEXT: s_wait_loadcnt 0x0 ; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT5-NEXT: s_nop 0 @@ -140,24 +136,23 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT6-LABEL: test_barrier: ; VARIANT6: ; %bb.0: ; %entry -; VARIANT6-NEXT: s_load_b96 s[0:2], 
s[2:3], 0x24 +; VARIANT6-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; VARIANT6-NEXT: v_lshlrev_b32_e32 v5, 2, v0 ; VARIANT6-NEXT: s_wait_kmcnt 0x0 -; VARIANT6-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_and_b32 v4, 0x3ff, v0 ; VARIANT6-NEXT: s_sub_co_i32 s2, s2, 1 -; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) -; VARIANT6-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_lshlrev_b32 v5, 2, v4 -; VARIANT6-NEXT: v_sub_nc_u32_e32 v0, s2, v4 -; VARIANT6-NEXT: global_store_b32 v5, v4, s[0:1] -; VARIANT6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VARIANT6-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; VARIANT6-NEXT: v_sub_nc_u32_e32 v1, s2, v0 +; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] ; VARIANT6-NEXT: s_wait_storecnt 0x0 ; VARIANT6-NEXT: s_barrier_signal -1 ; VARIANT6-NEXT: s_barrier_wait -1 +; VARIANT6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT6-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] -; VARIANT6-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; VARIANT6-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] +; VARIANT6-NEXT: v_add_co_u32 v1, vcc_lo, v3, v1 ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT6-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; VARIANT6-NEXT: global_load_b32 v0, v[0:1], off +; VARIANT6-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo +; VARIANT6-NEXT: global_load_b32 v0, v[1:2], off ; VARIANT6-NEXT: s_wait_loadcnt 0x0 ; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] ; VARIANT6-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 8bfe996c6a90a3..38a34ec6daf73c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -5,11 +5,10 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 
s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -23,11 +22,10 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -53,11 +51,10 @@ entry: define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 
0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -71,11 +68,10 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -101,11 +97,10 @@ entry: define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -119,11 +114,10 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: 
v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -149,12 +143,12 @@ entry: define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_var: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_mov_b32 m0, 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: s_mov_b32 m0, 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v1, s[0:1] @@ -168,13 +162,11 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_var: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v2, 0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; 
GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -230,10 +222,8 @@ define void @test2_s_barrier_signal_var(i32 %arg) { define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -252,10 +242,8 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -290,10 +278,8 @@ entry: define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; 
GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -312,10 +298,8 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -350,10 +334,8 @@ entry: define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -372,10 +354,8 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; 
GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -410,11 +390,9 @@ entry: define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_isfirst_var: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_mov_b32 m0, 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -433,11 +411,9 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) % ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst_var: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -542,11 +518,10 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test1_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 
:: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -560,11 +535,10 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -588,11 +562,10 @@ entry: define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test2_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -606,11 +579,10 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_init: ; 
GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -634,11 +606,10 @@ entry: define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test3_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -652,11 +623,10 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; 
GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -680,17 +650,15 @@ entry: define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 { ; GCN-LABEL: test4_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] ; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_barrier_init m0 ; GCN-NEXT: global_store_b32 v3, v0, s[0:1] @@ -700,11 +668,10 @@ define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, ; ; GLOBAL-ISEL-LABEL: test4_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 s3, 16, s3 @@ -765,12 +732,11 @@ 
define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join -1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_barrier_join -1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -780,11 +746,10 @@ define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -807,12 +772,11 @@ entry: define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_barrier_join 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -822,11 +786,10 @@ define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -849,12 +812,11 @@ entry: define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_barrier_join 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -864,11 +826,10 @@ define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; 
GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -891,11 +852,11 @@ entry: define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_barrier_join_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 @@ -908,11 +869,10 @@ define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %b ; ; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 
m0, s2 @@ -964,10 +924,8 @@ define void @test5_s_barrier_join_m0(i32 %arg) { define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_leave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_barrier_leave @@ -985,16 +943,14 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_leave: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_barrier_leave ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 ; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GLOBAL-ISEL-NEXT: s_clause 0x1 @@ -1022,12 +978,11 @@ entry: define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier -1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_wakeup_barrier -1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -1037,11 +992,10 @@ define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1064,12 +1018,11 @@ entry: define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_wakeup_barrier 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -1079,11 +1032,10 @@ define 
amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1106,12 +1058,11 @@ entry: define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_wakeup_barrier 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -1121,11 +1072,10 @@ define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: 
v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1148,11 +1098,11 @@ entry: define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_wakeup_barrier_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 @@ -1165,11 +1115,10 @@ define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 ; ; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 @@ -1221,12 +1170,11 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) { define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry 
-; GCN-NEXT: s_get_barrier_state s4, -1 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_get_barrier_state s2, -1 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1234,14 +1182,13 @@ define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, -1 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1259,12 +1206,11 @@ entry: define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, 1 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_get_barrier_state s2, 1 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1272,14 +1218,13 @@ define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 1 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1297,12 +1242,11 @@ entry: define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, 0 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_get_barrier_state s2, 0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1310,14 +1254,13 @@ define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1335,10 +1278,8 @@ entry: define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_get_barrier_state_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1353,10 +1294,8 @@ define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i ; ; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1413,11 +1352,10 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) { define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test_barrier_convert: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1431,11 +1369,10 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test_barrier_convert: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll index bc7052132a87b0..4a404af54188d6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll @@ -37,7 +37,7 @@ define void @test_s_sleep_var2() { define amdgpu_kernel void @test_s_sleep_var3(i32 %arg) { ; GCN-LABEL: test_s_sleep_var3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GCN-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_sleep_var s0 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll index 527627a5a2f67d..c2e74eb05d1645 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll @@ -5,11 +5,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 ; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:16 @@ -73,11 +72,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; EXACTCUTOFF-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v40, 5, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:16 @@ -177,11 +175,10 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16 ; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 @@ -259,11 +256,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; EXACTCUTOFF-NEXT: 
v_add_nc_u32_e32 v16, s1, v16 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll index a29e2298210a3a..fdcb1773d0a3f4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -7,12 +7,11 @@ declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16( define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0 ; GCN-NEXT: v_mov_b32_e32 v48, 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 ; GCN-NEXT: ds_load_b128 v[8:11], v0 @@ -59,12 +58,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) ; 
EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v0 @@ -149,131 +147,127 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v16, 0x3ff, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v18, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GCN-NEXT: v_lshl_add_u32 v17, v16, 5, s0 -; GCN-NEXT: v_lshl_add_u32 v16, v16, 4, s1 -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:1024 -; GCN-NEXT: ds_load_b128 v[0:3], v17 -; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 +; GCN-NEXT: v_lshl_add_u32 v17, v0, 5, s0 +; GCN-NEXT: v_lshl_add_u32 v0, v0, 4, s1 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:1024 +; GCN-NEXT: ds_load_b128 v[1:4], v17 +; GCN-NEXT: ds_load_b128 v[5:8], v17 offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x2 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_wait_dscnt 0x0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:2560 -; GCN-NEXT: v_mov_b32_e32 v16, s1 +; GCN-NEXT: ds_store_b128 v0, v[13:16] +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:2560 +; GCN-NEXT: 
v_mov_b32_e32 v0, s1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:512 -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:4608 +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:512 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:4608 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1024 -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:7168 +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1024 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:7168 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: 
v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1536 -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:10240 +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1536 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:10240 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:2048 +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:2048 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x3ff, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v18, 0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) -; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v16, 5, s0 -; 
EXACTCUTOFF-NEXT: v_lshl_add_u32 v16, v16, 4, s1 -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:1024 -; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 -; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v0, 5, s0 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v0, v0, 4, s1 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[1:4], v17 +; EXACTCUTOFF-NEXT: ds_load_b128 v[5:8], v17 offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:2560 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:2560 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:512 -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:4608 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:512 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:4608 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1024 -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:7168 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:7168 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1536 -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:10240 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1536 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:10240 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:2048 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:2048 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 24b8a3c2dc8730..10f09b6390abae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -29,8 +29,7 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: 
test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -97,8 +96,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -180,27 +178,34 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 +; GCN-NEXT: 
v_mul_lo_u32 v28, v28, v28 +; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 +; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 -; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 -; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 -; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 +; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 +; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -208,33 +213,24 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:48 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 -; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 +; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 +; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 +; GCN-NEXT: v_mul_lo_u32 
v12, v12, v12 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 -; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 -; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_mul_lo_u32 v19, v19, v19 ; GCN-NEXT: v_mul_lo_u32 v18, v18, v18 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; GCN-NEXT: v_mul_lo_u32 v17, v17, v17 -; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 -; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v23, v23, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -245,11 +241,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v22, v22, v22 ; GCN-NEXT: v_mul_lo_u32 v21, v21, v21 ; GCN-NEXT: v_mul_lo_u32 v20, v20, v20 +; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 ; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 ; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 ; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 ; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 ; GCN-NEXT: 
global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 ; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] @@ -261,27 +258,34 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 -; 
EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -289,33 +293,24 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:48 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier 
mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v19, v19, v19 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v18, v18, v18 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v17, v17, v17 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v23, v23, v23 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) @@ -326,11 +321,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v22, v22, v22 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v21, v21, v21 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v20, v20, v20 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 ; 
EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] @@ -385,23 +381,23 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 7, v0 ; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 ; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -409,7 +405,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 -; 
GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -423,17 +418,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 -; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 +; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -442,7 +430,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 ; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) 
@@ -450,47 +439,53 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 +; GCN-NEXT: ; 
sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 7, v0 ; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -498,7 +493,6 @@ define amdgpu_kernel void 
@test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -512,17 +506,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -531,7 +518,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: 
v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) @@ -539,25 +527,31 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; 
EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #2 @@ -620,9 +614,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 @@ -727,9 +720,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 @@ -870,9 +862,8 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1004,9 +995,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -1198,9 +1188,9 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out, <5 x float> %in1) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 ; GCN-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v7, 0x32a5705f ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1212,8 +1202,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_add_f32_e32 v4, v6, v4 ; GCN-NEXT: v_exp_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v1, s2, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -1288,7 +1277,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_mul_f32_e32 v4, s7, v3 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GCN-NEXT: v_rndne_f32_e32 v10, v4 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x54 +; GCN-NEXT: s_load_dword s8, s[0:1], 0x54 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] ; GCN-NEXT: v_sub_f32_e32 v1, v4, v10 @@ -1324,7 +1313,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v0, s3, v0 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] @@ -1335,8 +1324,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ; kill: killed $sgpr2_sgpr3 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; GCN-NEXT: 
; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1383,9 +1372,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x32a5705f ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -1397,8 +1386,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v6, v4 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 ; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s2, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -1473,7 +1461,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s7, v3 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v4 -; EXACTCUTOFF-NEXT: s_load_dword s8, s[2:3], 0x54 +; EXACTCUTOFF-NEXT: s_load_dword s8, s[0:1], 0x54 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] ; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v10 @@ -1509,7 +1497,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 
vcc, s8, v6 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s3, v0 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] @@ -1520,8 +1508,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr2_sgpr3 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s3 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll index 363c54d4abe908..eb30484ea7f19e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_get_doorbell: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -16,7 +16,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_get_doorbell: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: 
s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -32,7 +32,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_get_ddid: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -43,7 +43,7 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_get_ddid: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -59,7 +59,7 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_tma: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -76,7 +76,7 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_realtime: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -93,7 +93,7 @@ define amdgpu_kernel void 
@test_get_realtime(ptr addrspace(1) %out) { define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_savewave: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -104,7 +104,7 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_savewave: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -120,7 +120,7 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_tba: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -137,7 +137,7 @@ define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_get_0_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -148,7 +148,7 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_get_0_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -164,7 +164,7 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_99999_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index 114d2d099ab7b1..fc33206845a713 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -24,7 +24,7 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 1 @@ -39,7 +39,7 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -61,7 +61,7 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -77,17 +77,17 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 56 +; GCN-NEXT: s_cmp_lg_u32 s3, 56 ; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc1 .LBB4_3 ; GCN-NEXT: ; %bb.1: ; %Flow @@ -127,8 +127,8 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x40400000 @@ -147,7 +147,7 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -171,8 +171,8 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x10001 @@ -191,8 +191,8 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x3c003c00 @@ -211,7 +211,7 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 
@@ -235,7 +235,7 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 1.0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -259,8 +259,8 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x3f803f80 @@ -279,7 +279,7 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x10001 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -303,7 +303,7 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -327,7 +327,7 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> 
%in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -351,7 +351,7 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -373,8 +373,8 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -392,8 +392,8 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -411,8 +411,8 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -430,8 +430,8 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll index c1f1782ea5a87f..5401de0b082883 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -5,11 +5,7 @@ define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffs ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen offset:24 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 24 @@ -22,11 +18,7 @@ define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset_ ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 idxen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -37,11 +29,7 @@ define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffs ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen slc +; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen slc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -52,11 +40,7 @@ define void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vof ; CHECK-LABEL: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s18 idxen offen offset:24 +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[4:7], s8 idxen offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 24 diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll index 78204dfefc80cc..e0e4f950cc16c2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll @@ -10,7 +10,7 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsr ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -25,7 +25,7 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgp ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen +; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %unused = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll index 10059960030446..864244b6cebcf9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll @@ -8,11 +8,7 @@ define void 
@struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen +; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -21,11 +17,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[4:7], s8 idxen offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -34,7 +26,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s4 idxen offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -45,7 +37,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s4 idxen offen ; GFX1200-NEXT: 
s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -56,29 +48,21 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 idxen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 idxen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 idxen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -89,7 +73,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 
s4 idxen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void @@ -99,11 +83,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -112,11 +92,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen slc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[4:7], s8 idxen offen slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -125,7 +101,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s4 idxen offen nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -136,7 +112,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; 
GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void @@ -146,11 +122,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX908-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s18 idxen offen +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[4:7], s8 idxen offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -159,11 +131,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[8:11], s18 idxen offen +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[4:7], s8 idxen offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -172,7 +140,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], 
s[0:3], s6 idxen offen +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s4 idxen offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -183,7 +151,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s6 idxen offen +; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll index 5f6a67e4660209..ba6005e004efc4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll @@ -9,11 +9,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen glc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[4:7], s8 idxen offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -22,7 +18,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen sc0 +; 
GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s4 idxen offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -33,7 +29,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -45,18 +41,14 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffs ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen glc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 idxen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 idxen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -67,7 +59,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffs ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 
v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -80,11 +72,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen glc slc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[4:7], s8 idxen offen glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -93,7 +81,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen sc0 nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s4 idxen offen sc0 nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -104,7 +92,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, 
i32 %soffset, i32 2) @@ -117,11 +105,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[8:11], s18 idxen offen glc +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[4:7], s8 idxen offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -130,7 +114,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s6 idxen offen sc0 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s4 idxen offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -141,7 +125,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__ ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll index bd803c380e90a5..1fb5d53d5fd826 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll 
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll @@ -10,40 +10,28 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen glc +; 
GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -54,7 +42,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -65,40 +53,28 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -109,7 +85,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -121,40 +97,28 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffs ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen glc +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc ; 
GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen glc +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen glc +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s6 idxen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s4 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -165,7 +129,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, 
i32 %soffset, i32 0) @@ -176,40 +140,28 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc slc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen glc slc +; GFX11-NEXT: 
buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen glc slc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -220,7 +172,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -231,39 +183,27 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: @@ -273,7 +213,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -283,39 +223,27 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: @@ -325,7 +253,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 
256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -337,39 +265,27 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s6 idxen +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s4 idxen ; 
GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: @@ -379,7 +295,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s6 idxen +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void @@ -389,39 +305,27 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: 
s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen slc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen slc ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: @@ -431,7 +335,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void @@ -449,14 +353,14 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX6-NEXT: v_readfirstlane_b32 s10, v3 ; GFX6-NEXT: v_readfirstlane_b32 s11, v4 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4] +; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen 
offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[12:13] @@ -473,14 +377,14 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX7-NEXT: v_readfirstlane_b32 s10, v3 ; GFX7-NEXT: v_readfirstlane_b32 s11, v4 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4] +; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[12:13] @@ -490,25 +394,25 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v1 ; GFX10-NEXT: v_readfirstlane_b32 s9, v2 ; GFX10-NEXT: v_readfirstlane_b32 s10, v3 ; GFX10-NEXT: v_readfirstlane_b32 s11, v4 ; GFX10-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[3:4] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[3:4] +; GFX10-NEXT: s_and_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_and_saveexec_b32 s5, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc ; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll index 4f9bac584a78e4..b859147b6dc6b2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll @@ -9,22 +9,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -35,22 +27,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -62,22 +46,14 @@ define double 
@struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -88,22 +64,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; 
%bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -114,22 +82,14 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -140,22 +100,14 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: 
struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -168,22 +120,14 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: 
s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -194,22 +138,14 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -228,14 +164,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX6-NEXT: v_readfirstlane_b32 s10, v4 ; GFX6-NEXT: v_readfirstlane_b32 s11, v5 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], 
s[10:11], v[4:5] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5] +; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[12:13] @@ -252,14 +188,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX7-NEXT: v_readfirstlane_b32 s10, v4 ; GFX7-NEXT: v_readfirstlane_b32 s11, v5 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5] +; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[12:13] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll index c9b50eddc94eef..87055db9a58f09 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll @@ -10,40 +10,28 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -54,7 +42,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -65,40 +53,28 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -109,7 +85,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -121,40 +97,28 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voff ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen glc +; GFX6-NEXT: 
buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen glc +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s6 idxen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s4 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -165,7 +129,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float 
@llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -176,40 +140,28 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen glc slc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen glc slc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -220,7 +172,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -231,39 +183,27 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: @@ -273,7 +213,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -283,39 +223,27 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX6-NEXT: 
buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: @@ -325,7 +253,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 +; GFX12-NEXT: 
buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -337,39 +265,27 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s6 idxen +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s4 idxen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: @@ -379,7 +295,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s6 idxen +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void @@ -389,39 +305,27 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen slc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen slc ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: @@ -431,7 +335,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void @@ -449,14 +353,14 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX6-NEXT: v_readfirstlane_b32 s10, v3 ; GFX6-NEXT: v_readfirstlane_b32 s11, v4 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4] +; GFX6-NEXT: 
s_and_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[12:13] @@ -473,14 +377,14 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX7-NEXT: v_readfirstlane_b32 s10, v3 ; GFX7-NEXT: v_readfirstlane_b32 s11, v4 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4] +; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[12:13] @@ -490,25 +394,25 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; 
GFX10-NEXT: v_readfirstlane_b32 s8, v1 ; GFX10-NEXT: v_readfirstlane_b32 s9, v2 ; GFX10-NEXT: v_readfirstlane_b32 s10, v3 ; GFX10-NEXT: v_readfirstlane_b32 s11, v4 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[3:4] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[3:4] +; GFX10-NEXT: s_and_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_and_saveexec_b32 s5, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc ; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll index 01bc833d59be79..5c23a86dab33ab 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll @@ -9,22 +9,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc +; GFX6-NEXT: 
buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -35,22 +27,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt 
vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -62,22 +46,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -88,22 +64,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -114,22 +82,14 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ 
-140,22 +100,14 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -168,22 +120,14 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -194,22 +138,14 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -228,14 +164,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX6-NEXT: v_readfirstlane_b32 s10, v4 ; GFX6-NEXT: 
v_readfirstlane_b32 s11, v5 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5] +; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[12:13] @@ -252,14 +188,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX7-NEXT: v_readfirstlane_b32 s10, v4 ; GFX7-NEXT: v_readfirstlane_b32 s11, v5 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5] +; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[12:13] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll index 38fdcf47171aff..439742d6b315d3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll @@ -8,40 +8,40 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; 
GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -57,43 +57,43 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s4, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s7 ; 
PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: 
v_mov_b32_e32 v1, s5 @@ -109,29 +109,29 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[4:5], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s7, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], v3, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; 
PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm @@ -139,13 +139,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm @@ -153,9 +153,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[0:1], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 @@ -174,30 +174,30 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[4:5], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s5, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s8, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s7, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s7, s7, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s8, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s8 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s7 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v4, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s7 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v4, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; 
PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm @@ -205,12 +205,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm @@ -218,9 +218,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; 
GFX11-PACKED-NEXT: s_load_b32 s6, s[0:1], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll index 1da076c6523990..22ec22dc2db024 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -11,40 +11,40 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-PACKED-NEXT: 
tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -56,8 +56,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 ; GFX12-PACKED-LABEL: tbuffer_store_d16_x: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -73,43 +73,43 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: 
tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s4, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s7 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; 
GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -121,8 +121,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; GFX12-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -138,29 +138,29 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, 
s[4:5], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s7, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], v3, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm @@ -168,13 +168,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; 
GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm @@ -182,9 +182,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[0:1], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 @@ -198,8 +198,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-SDAG: ; %bb.0: ; %main_body ; GFX12-PACKED-SDAG-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x10 -; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x10 +; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-PACKED-SDAG-NEXT: s_wait_kmcnt 0x0 
; GFX12-PACKED-SDAG-NEXT: s_and_b32 s5, s5, 0xffff ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4 @@ -213,8 +213,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body ; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 -; GFX12-PACKED-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x10 -; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x10 +; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s4, s4, s4 ; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -233,30 +233,30 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[4:5], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s5, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s8, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s7, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s7, s7, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s8, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s8 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, 
s7 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v4, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s7 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v4, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm @@ -264,12 +264,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; 
GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm @@ -277,9 +277,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[0:1], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -292,8 +292,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b96 s[4:6], s[2:3], 0x10 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-NEXT: s_load_b96 s[4:6], s[0:1], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index 279a64adfbda15..0755dcddd8f46e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt 
lgkmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -36,7 +36,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_arg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -50,7 +50,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -69,7 +69,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_imm_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -83,7 +83,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_imm_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -102,7 +102,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, define 
amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_imm_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s8, 0x7b @@ -117,7 +117,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, ; ; VI-LABEL: bfe_u32_imm_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s8, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -137,7 +137,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_0_width_reg_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -147,7 +147,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: bfe_u32_arg_0_width_reg_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -162,7 +162,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_0_width_imm_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -172,7 +172,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr 
addrspace(1) %out, ; ; VI-LABEL: bfe_u32_arg_0_width_imm_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -187,7 +187,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zextload_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -204,7 +204,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: bfe_u32_zextload_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -229,7 +229,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -248,7 +248,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: bfe_u32_zext_in_reg_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -275,7 +275,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr addrspace(1) 
%in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -294,7 +294,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: bfe_u32_zext_in_reg_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -321,7 +321,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -341,7 +341,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -369,7 +369,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -389,7 +389,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; VI: ; 
%bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -417,7 +417,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -437,7 +437,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -465,7 +465,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -484,7 +484,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou ; ; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -511,7 +511,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; 
SI-LABEL: bfe_u32_test_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -529,7 +529,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -553,7 +553,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -563,7 +563,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -580,7 +580,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -590,7 +590,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -607,7 +607,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -617,7 +617,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -635,7 +635,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_5: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -653,7 +653,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_5: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -679,7 +679,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ 
-698,7 +698,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -724,7 +724,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -742,7 +742,7 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -767,7 +767,7 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -785,7 +785,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -810,7 +810,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr 
addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_9: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -828,7 +828,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_9: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -852,7 +852,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -870,7 +870,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -894,7 +894,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_11: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -912,7 +912,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_11: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -936,7 +936,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_12: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -954,7 +954,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_12: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -979,7 +979,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_13: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -997,7 +997,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_13: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1021,7 +1021,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_14: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; 
SI-NEXT: s_mov_b32 s2, -1 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_14: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1047,7 +1047,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1057,7 +1057,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1097,7 +1097,7 @@ define amdgpu_kernel void 
@bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -1 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void 
@bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -1 @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_5: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_5: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1197,7 +1197,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x80 @@ -1207,7 +1207,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x80 @@ -1222,7 +1222,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void 
@bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1232,7 +1232,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1247,7 +1247,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1272,7 +1272,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_9: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # ; ; VI-LABEL: 
bfe_u32_constant_fold_test_9: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1307,7 +1307,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1322,7 +1322,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_11: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 10 @@ -1332,7 +1332,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_11: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 @@ -1347,7 +1347,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; SI-LABEL: 
bfe_u32_constant_fold_test_12: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1357,7 +1357,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_12: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_13: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1382,7 +1382,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_13: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_14: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 40 @@ -1407,7 +1407,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_14: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 40 @@ -1422,7 +1422,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_15: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 10 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_15: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 @@ -1447,7 +1447,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_17: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1482,7 +1482,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_17: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1497,7 +1497,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_18: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1507,7 +1507,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_18: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1526,45 +1526,47 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0, ; SI-LABEL: simplify_bfe_u32_multi_use_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s4, s10 +; SI-NEXT: s_mov_b32 s5, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 63, v0 ; SI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: simplify_bfe_u32_multi_use_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 63, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 { @@ -1579,11 +1581,11 @@ define amdgpu_kernel void 
@simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0 define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1591,8 +1593,8 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1609,7 +1611,7 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: v_lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s2, s2, s3 @@ -1623,7 +1625,7 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 ; ; VI-LABEL: v_lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,11 +1645,11 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: and_lshr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1655,8 +1657,8 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: and_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1673,11 +1675,11 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: and_lshr2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1685,8 +1687,8 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: and_lshr2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1703,11 +1705,11 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: 
shl_lshr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x150002 +; SI-NEXT: s_bfe_u32 s4, s2, 0x150002 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1715,8 +1717,8 @@ define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: shl_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll index abce1f6cd8f84a..ab29ca4a997348 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll @@ -1,8 +1,7 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o %t.bc -; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: 
llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D %s declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll index eaee8ec73fe411..47f988fc17d281 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll @@ -1,10 +1,9 @@ -; RUN: opt -mtriple=amdgcn-- -passes=amdgpu-attributor -o %t.bc %s -; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %t.bc | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %t.bc | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii < %t.bc | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global < %t.bc | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %t.bc | FileCheck -check-prefixes=ALL,PACKED-TID %s -; RUN: llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %t.bc | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: llc 
-mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 9d93ca65683c42..31f1085dd76ee4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -14,7 +14,7 @@ declare double @llvm.amdgcn.writelane.f64(double, i32, double) #0 define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1010-SDAG-LABEL: test_writelane_sreg_i32: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dword s4, s[0:1], 0x0 @@ -40,7 +40,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_i32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x0 @@ -54,7 +54,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX802-GISEL-LABEL: test_writelane_sreg_i32: ; GFX802-GISEL: ; %bb.0: 
-; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -68,7 +68,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1010-GISEL-LABEL: test_writelane_sreg_i32: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dword s4, s[0:1], 0x0 @@ -80,7 +80,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_i32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x0 @@ -100,8 +100,8 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -118,24 +118,24 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX1010-SDAG-LABEL: test_writelane_sreg_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: 
s_load_dword s8, s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s8 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -151,8 +151,8 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; ; GFX802-GISEL-LABEL: test_writelane_sreg_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -169,24 +169,24 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX1010-GISEL-LABEL: test_writelane_sreg_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x10 +; 
GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -208,8 +208,8 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -226,24 +226,24 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX1010-SDAG-LABEL: test_writelane_sreg_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s8 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -259,8 +259,8 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; ; GFX802-GISEL-LABEL: test_writelane_sreg_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -277,24 +277,24 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX1010-GISEL-LABEL: test_writelane_sreg_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 
-; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -316,8 +316,8 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -331,8 +331,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: 
s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -345,23 +345,23 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s2 -; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s0 +; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -375,8 +375,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i32: ; 
GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -389,15 +389,15 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s2 -; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s0 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -410,8 +410,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -427,41 +427,41 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 0, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 32, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 0, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 32, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 0, s4 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s4 -; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: global_store_b64 
v2, v[0:1], s[2:3] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[4:5], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -477,33 +477,33 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 32, s4 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 0, s4 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 32, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 0, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s4 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 0, s4 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -516,8 +516,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x8 ; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -535,8 +535,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -544,35 +544,35 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX1010-SDAG-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX1010-SDAG-NEXT: s_mov_b32 s2, 0x40400000 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 0, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s2, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 0, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX1100-SDAG-NEXT: s_mov_b32 s2, 0x40400000 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-SDAG-NEXT: s_mov_b32 s0, 0x40400000 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s0, s4 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 0, s4 -; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[4:5], 0x8 ; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -590,8 +590,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -599,26 +599,26 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX1010-GISEL-NEXT: s_mov_b32 s2, 0x40400000 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 0, s4 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 0, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s2, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1100-GISEL-NEXT: s_mov_b32 s2, 0x40400000 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; 
GFX1100-GISEL-NEXT: v_writelane_b32 v0, 0, s4 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -631,7 +631,7 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -654,7 +654,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i32: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: global_load_dword v0, v0, s[2:3] offset:4 @@ -671,9 +671,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 @@ -692,7 +690,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, 
p ; ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -716,7 +714,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i32: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: global_load_dword v0, v0, s[2:3] offset:4 @@ -733,9 +731,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 @@ -764,7 +760,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -789,7 +785,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1010-SDAG-LABEL: 
test_writelane_vreg_lane_i64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -808,11 +804,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:8 ; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -831,7 +825,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -856,7 +850,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8 @@ -874,9 +868,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1100-GISEL-LABEL: 
test_writelane_vreg_lane_i64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:8 @@ -907,7 +899,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -934,7 +926,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_f64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -954,11 +946,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_f64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, 
s[2:3] offset:8 ; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -978,7 +968,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,7 +995,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_f64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8 @@ -1024,9 +1014,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_f64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:8 @@ -1059,8 +1047,8 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_m0_sreg_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 
0x8 ; GFX802-SDAG-NEXT: ;;#ASMSTART ; GFX802-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX802-SDAG-NEXT: ;;#ASMEND @@ -1079,8 +1067,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1010-SDAG-LABEL: test_writelane_m0_sreg_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: ;;#ASMSTART ; GFX1010-SDAG-NEXT: s_mov_b32 m0, -1 @@ -1096,26 +1084,26 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1100-SDAG-LABEL: test_writelane_m0_sreg_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: ;;#ASMSTART ; GFX1100-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX1100-SDAG-NEXT: ;;#ASMEND ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, m0, s2 -; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, m0, s0 +; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_m0_sreg_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; 
GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX802-GISEL-NEXT: ;;#ASMSTART ; GFX802-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX802-GISEL-NEXT: ;;#ASMEND @@ -1134,8 +1122,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1010-GISEL-LABEL: test_writelane_m0_sreg_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX1010-GISEL-NEXT: ;;#ASMSTART ; GFX1010-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX1010-GISEL-NEXT: ;;#ASMEND @@ -1151,18 +1139,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1100-GISEL-LABEL: test_writelane_m0_sreg_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX1100-GISEL-NEXT: ;;#ASMSTART ; GFX1100-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX1100-GISEL-NEXT: ;;#ASMEND ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, m0, s2 -; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, m0, s0 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -1176,8 +1164,8 @@ define amdgpu_kernel void 
@test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -1191,8 +1179,8 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1010-SDAG-LABEL: test_writelane_imm_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1205,23 +1193,23 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1100-SDAG-LABEL: test_writelane_imm_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, 32 -; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, 32 +; 
GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1235,8 +1223,8 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1010-GISEL-LABEL: test_writelane_imm_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1249,15 +1237,15 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1100-GISEL-LABEL: test_writelane_imm_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, 32 -; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1 
+; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, 32 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -1270,7 +1258,7 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %src0) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1285,7 +1273,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1010-SDAG-LABEL: test_writelane_imm_i64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1299,7 +1287,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1100-SDAG-LABEL: test_writelane_imm_i64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1315,7 +1303,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX802-GISEL-LABEL: test_writelane_imm_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: 
v_mov_b32_e32 v3, s1 @@ -1330,7 +1318,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1010-GISEL-LABEL: test_writelane_imm_i64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1344,7 +1332,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1100-GISEL-LABEL: test_writelane_imm_i64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1366,7 +1354,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double %src0) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1381,7 +1369,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1010-SDAG-LABEL: test_writelane_imm_f64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1395,7 +1383,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1100-SDAG-LABEL: test_writelane_imm_f64: ; GFX1100-SDAG: ; %bb.0: 
-; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1411,7 +1399,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX802-GISEL-LABEL: test_writelane_imm_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1426,7 +1414,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1010-GISEL-LABEL: test_writelane_imm_f64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1440,7 +1428,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1100-GISEL-LABEL: test_writelane_imm_f64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1462,10 +1450,10 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0 +; 
GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s6 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0 @@ -1476,11 +1464,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s6 ; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm @@ -1488,8 +1476,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 @@ -1501,10 +1489,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; 
GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0 @@ -1515,11 +1503,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm @@ -1527,8 +1515,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -1545,12 +1533,12 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX802-SDAG-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-SDAG-NEXT: s_mov_b32 m0, s8 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1562,30 +1550,30 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x2 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX1010-SDAG-NEXT: s_load_dword s8, s[4:5], 0x18 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s5, s8 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s4, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s6, s8 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x2 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x18 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x18 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s1, s2 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, s2 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s0 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1593,13 +1581,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX802-GISEL-NEXT: s_mov_b32 m0, s8 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 @@ -1610,30 +1598,30 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x2 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX1010-GISEL-NEXT: s_load_dword s8, s[4:5], 0x18 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s4, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s5, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s7, s8 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x2 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x18 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x18 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s1, s2 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s0 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s0 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1646,12 +1634,12 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: 
v_mov_b32_e32 v3, s1 -; GFX802-SDAG-NEXT: s_mov_b32 m0, s8 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1663,30 +1651,30 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x2 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX1010-SDAG-NEXT: s_load_dword s8, s[4:5], 0x18 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s5, s8 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s4, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s6, s8 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x2 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x18 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x18 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s1, s2 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, s2 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s0 +; GFX1100-SDAG-NEXT: 
v_writelane_b32 v0, s2, s0 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1694,13 +1682,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX802-GISEL-NEXT: s_mov_b32 m0, s8 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 @@ -1711,30 +1699,30 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x2 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX1010-GISEL-NEXT: s_load_dword s8, s[4:5], 0x18 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s4, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s5, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s7, s8 ; GFX1010-GISEL-NEXT: 
global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x2 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x18 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x18 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s1, s2 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s0 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s0 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1747,7 +1735,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 @@ -1759,7 +1747,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1769,7 +1757,7 @@ define 
amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1781,7 +1769,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 @@ -1793,7 +1781,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1803,7 +1791,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1820,8 +1808,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; 
GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1836,22 +1824,22 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -1865,12 +1853,12 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 @@ -1881,22 +1869,22 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s4 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s4 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -1915,8 +1903,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_f64: ; GFX802-SDAG: ; 
%bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1931,22 +1919,22 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -1960,12 +1948,12 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; 
GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 @@ -1976,22 +1964,22 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s4 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s4 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index 5cf457d1753b30..eeddb3d5b81923 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -10,7 +10,7 @@ declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a) define amdgpu_kernel void @ceil_f16( ; SI-LABEL: ceil_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @ceil_f16( ; ; VI-LABEL: ceil_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -48,7 +48,7 @@ define amdgpu_kernel void @ceil_f16( ; ; GFX11-LABEL: ceil_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -68,7 +68,7 @@ define amdgpu_kernel void @ceil_f16( ; ; GFX11-FAKE16-LABEL: ceil_f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -104,7 +104,7 @@ entry: define amdgpu_kernel void @ceil_v2f16( ; SI-LABEL: ceil_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -130,7 +130,7 @@ define amdgpu_kernel void @ceil_v2f16( ; ; VI-LABEL: ceil_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -150,7 +150,7 @@ define 
amdgpu_kernel void @ceil_v2f16( ; ; GFX11-LABEL: ceil_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -179,7 +179,7 @@ define amdgpu_kernel void @ceil_v2f16( ; ; GFX11-FAKE16-LABEL: ceil_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 5514efa6838e73..fcc4cb3436fd7a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: cos_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: cos_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -46,7 +46,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: cos_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -58,7 +58,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) 
{ ; ; GFX10-LABEL: cos_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -70,7 +70,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX11-LABEL: cos_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -91,7 +91,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: cos_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -121,7 +121,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: cos_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -142,7 +142,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: cos_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -158,7 +158,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX10-LABEL: cos_v2f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -174,7 +174,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX11-LABEL: cos_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 142145098df87f..3a867879bb809b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -12,34 +12,33 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-LABEL: s_exp_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s0, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-SDAG-NEXT: 
v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -47,34 +46,33 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2 +; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; 
VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -82,16 +80,16 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 @@ -99,36 +97,36 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: 
v_cmp_ngt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -136,10 +134,10 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; SI-SDAG-LABEL: s_exp_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s4, s[0:1], 0xb ; 
SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 @@ -164,29 +162,29 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -338,7 +336,7 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { define 
amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 @@ -390,7 +388,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -442,7 +440,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 @@ -481,7 +479,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-GISEL-LABEL: s_exp_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -520,7 +518,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -562,7 
+560,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-GISEL-LABEL: s_exp_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -853,25 +851,25 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_exp_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7 @@ -917,7 +915,6 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 ; 
VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 @@ -926,19 +923,19 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x39a3b295 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v2 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5 @@ -990,7 +987,6 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 @@ -999,11 +995,11 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v3f32: ; GFX900-SDAG: ; %bb.0: -; 
GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 @@ -1052,10 +1048,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-GISEL-LABEL: s_exp_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 @@ -1105,10 +1101,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x32a5705f +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 @@ -1160,10 +1156,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-GISEL-LABEL: s_exp_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 @@ -1594,26 +1590,26 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-LABEL: s_exp_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7 @@ -1677,7 +1673,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; 
VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 @@ -1686,28 +1681,29 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x39a3b295 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1 @@ -1723,7 +1719,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-GISEL-NEXT: 
v_mul_f32_e32 v8, s2, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 ; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 @@ -1748,7 +1744,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8 ; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 @@ -1769,7 +1764,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1778,11 +1772,11 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1793,8 +1787,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 -; GFX900-SDAG-NEXT: v_mov_b32_e32 
v6, 0x42b17218 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3 @@ -1839,16 +1833,17 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 @@ -1910,11 +1905,11 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1926,7 +1921,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 
0xc2ce8ed0 ; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4 -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v3 @@ -1972,16 +1967,17 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f ; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; SI-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 4d981d27c309ea..a162949587481e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -14,34 +14,33 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-LABEL: s_exp10_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, 
s2, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s0, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -49,34 +48,33 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp10_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2 +; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 
0x3a2784bc, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -84,16 +82,16 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; 
GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 @@ -101,36 +99,36 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; GFX900-GISEL-NEXT: 
v_mov_b32_e32 v2, 0x7f800000 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -138,10 +136,10 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-SDAG-LABEL: s_exp10_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s4, s[0:1], 0xb ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 @@ -166,29 +164,29 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp10_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -340,7 +338,7 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 @@ -392,7 +390,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -444,7 +442,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 @@ -483,7 +481,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x 
float> %in) ; ; GFX900-GISEL-LABEL: s_exp10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -522,7 +520,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -564,7 +562,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_exp10_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -855,25 +853,25 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7 @@ -919,7 +917,6 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 @@ -928,19 +925,19 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3a2784bc +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v2 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: 
v_mul_f32_e32 v4, s2, v2 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5 @@ -992,7 +989,6 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 @@ -1001,11 +997,11 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 @@ -1054,10 +1050,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-GISEL-LABEL: s_exp10_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 @@ -1107,10 +1103,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-SDAG-LABEL: 
s_exp10_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x33979a37 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 @@ -1162,10 +1158,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_exp10_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 @@ -1596,26 +1592,26 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; 
VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7 @@ -1679,7 +1675,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 @@ -1688,28 +1683,29 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3a2784bc ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v3 ; VI-GISEL-NEXT: 
v_add_f32_e32 v0, v4, v0 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1 @@ -1725,7 +1721,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 ; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 @@ -1750,7 +1746,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8 ; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 @@ -1771,7 +1766,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1780,11 +1774,11 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1795,8 +1789,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3 @@ -1841,16 +1835,17 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 @@ -1912,11 +1907,11 @@ define 
amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1928,7 +1923,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4 -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v3 @@ -1974,16 +1969,17 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp10_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37 ; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; SI-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 9f80e66e8f8731..36e78975cdb015 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -12,17 +12,17 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_exp2_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 @@ -31,35 +31,35 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp2_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-GISEL-NEXT: 
v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -69,14 +69,14 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp2_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc @@ -88,8 +88,8 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; 
GFX900-SDAG-LABEL: s_exp2_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -101,25 +101,24 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -173,7 +172,7 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; 
SI-SDAG-LABEL: s_exp2_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -199,7 +198,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -223,7 +222,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 @@ -247,7 +246,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -271,7 +270,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 @@ -294,7 +293,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> 
%in) ; ; GFX900-GISEL-LABEL: s_exp2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -381,8 +380,8 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -413,8 +412,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 @@ -445,8 +444,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -476,11 +475,11 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, 
<3 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -507,8 +506,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -532,16 +531,16 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0 -; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: 
v_cmp_lt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -656,45 +655,45 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v6, s6, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v8, s5, v8 -; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v6, s2, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v8, s1, v8 +; SI-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_exp_f32_e32 v8, v8 ; 
SI-SDAG-NEXT: v_exp_f32_e32 v9, v1 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -730,8 +729,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -767,8 +766,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -804,8 +803,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v4f32: ; GFX900-SDAG: ; %bb.0: -; 
GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -835,13 +834,13 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v7, v6 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v9, v8 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -871,7 +870,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp2_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll index ece55c7f7dceaa..e8d037c5ff53e0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -10,7 +10,7 @@ declare <2 x half> @llvm.floor.v2f16(<2 x half> %a) define amdgpu_kernel void @floor_f16( ; SI-LABEL: floor_f16: ; 
SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @floor_f16( ; ; VI-LABEL: floor_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -48,7 +48,7 @@ define amdgpu_kernel void @floor_f16( ; ; GFX11-LABEL: floor_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -68,7 +68,7 @@ define amdgpu_kernel void @floor_f16( ; ; GFX11-FAKE16-LABEL: floor_f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -105,7 +105,7 @@ entry: define amdgpu_kernel void @floor_v2f16( ; SI-LABEL: floor_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -131,7 +131,7 @@ define amdgpu_kernel void @floor_v2f16( ; ; VI-LABEL: floor_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -151,7 +151,7 @@ define amdgpu_kernel void @floor_v2f16( ; ; GFX11-LABEL: floor_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, 
-1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ define amdgpu_kernel void @floor_v2f16( ; ; GFX11-FAKE16-LABEL: floor_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index edcdd323cb0aee..a2e30603b6afcd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -14,7 +14,7 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> define amdgpu_kernel void @fmuladd_f16( ; SI-LABEL: fmuladd_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -48,7 +48,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; VI-FLUSH-LABEL: fmuladd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 ; VI-FLUSH-NEXT: s_mov_b32 s10, -1 ; VI-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -76,7 +76,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; VI-DENORM-LABEL: fmuladd_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 ; VI-DENORM-NEXT: s_mov_b32 s10, -1 ; VI-DENORM-NEXT: s_mov_b32 s14, s10 @@ -104,7 +104,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX10-FLUSH-LABEL: fmuladd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 ; 
GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -134,7 +134,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX10-DENORM-LABEL: fmuladd_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 @@ -162,7 +162,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX11-FLUSH-LABEL: fmuladd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -195,7 +195,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX11-DENORM-LABEL: fmuladd_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 @@ -237,131 +237,131 @@ define amdgpu_kernel void @fmuladd_f16( define amdgpu_kernel void @fmuladd_f16_imm_a( ; SI-LABEL: fmuladd_f16_imm_a: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, 
s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-FLUSH-LABEL: fmuladd_f16_imm_a: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 -; VI-FLUSH-NEXT: s_mov_b32 s10, -1 -; VI-FLUSH-NEXT: s_mov_b32 s14, s10 +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s2, -1 +; VI-FLUSH-NEXT: s_mov_b32 s14, s2 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_mov_b32 s12, s6 ; VI-FLUSH-NEXT: s_mov_b32 s13, s7 -; VI-FLUSH-NEXT: s_mov_b32 s15, s11 -; VI-FLUSH-NEXT: s_mov_b32 s2, s10 -; VI-FLUSH-NEXT: s_mov_b32 s3, s11 +; VI-FLUSH-NEXT: s_mov_b32 s15, s3 +; VI-FLUSH-NEXT: s_mov_b32 s10, s2 +; VI-FLUSH-NEXT: s_mov_b32 s11, s3 ; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: s_mov_b32 s8, s4 -; VI-FLUSH-NEXT: s_mov_b32 s9, s5 +; VI-FLUSH-NEXT: s_mov_b32 s0, s4 +; VI-FLUSH-NEXT: s_mov_b32 s1, s5 ; VI-FLUSH-NEXT: v_madmk_f16 v0, v0, 0x4200, v1 -; VI-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-FLUSH-NEXT: s_endpgm ; ; 
VI-DENORM-LABEL: fmuladd_f16_imm_a: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 -; VI-DENORM-NEXT: s_mov_b32 s10, -1 -; VI-DENORM-NEXT: s_mov_b32 s14, s10 +; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000 +; VI-DENORM-NEXT: s_mov_b32 s2, -1 +; VI-DENORM-NEXT: s_mov_b32 s14, s2 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_mov_b32 s12, s6 ; VI-DENORM-NEXT: s_mov_b32 s13, s7 -; VI-DENORM-NEXT: s_mov_b32 s15, s11 -; VI-DENORM-NEXT: s_mov_b32 s2, s10 -; VI-DENORM-NEXT: s_mov_b32 s3, s11 +; VI-DENORM-NEXT: s_mov_b32 s15, s3 +; VI-DENORM-NEXT: s_mov_b32 s10, s2 +; VI-DENORM-NEXT: s_mov_b32 s11, s3 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: s_movk_i32 s0, 0x4200 -; VI-DENORM-NEXT: s_mov_b32 s8, s4 -; VI-DENORM-NEXT: s_mov_b32 s9, s5 -; VI-DENORM-NEXT: v_fma_f16 v0, v0, s0, v1 -; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-DENORM-NEXT: s_mov_b32 s0, s4 +; VI-DENORM-NEXT: s_movk_i32 s4, 0x4200 +; VI-DENORM-NEXT: s_mov_b32 s1, s5 +; VI-DENORM-NEXT: v_fma_f16 v0, v0, s4, v1 +; VI-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-DENORM-NEXT: s_endpgm ; ; GFX10-FLUSH-LABEL: fmuladd_f16_imm_a: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 -; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11 -; GFX10-FLUSH-NEXT: s_mov_b32 s2, s10 -; GFX10-FLUSH-NEXT: 
s_mov_b32 s3, s11 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 -; GFX10-FLUSH-NEXT: s_mov_b32 s8, s4 +; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s9, s5 +; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmuladd_f16_imm_a: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 -; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s15, s11 -; GFX10-DENORM-NEXT: s_mov_b32 s2, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s3, s11 +; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-DENORM-NEXT: s_mov_b32 s2, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s10, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s11, s3 ; 
GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 ; GFX10-DENORM-NEXT: s_mov_b32 s13, s7 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s8, s4 -; GFX10-DENORM-NEXT: s_mov_b32 s9, s5 +; GFX10-DENORM-NEXT: s_mov_b32 s0, s4 +; GFX10-DENORM-NEXT: s_mov_b32 s1, s5 ; GFX10-DENORM-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1 -; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_a: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -388,8 +388,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( ; GFX11-DENORM-LABEL: fmuladd_f16_imm_a: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 @@ -423,131 +423,131 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( define amdgpu_kernel void @fmuladd_f16_imm_b( ; SI-LABEL: fmuladd_f16_imm_b: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 
-; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-FLUSH-LABEL: fmuladd_f16_imm_b: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 -; VI-FLUSH-NEXT: s_mov_b32 s10, -1 -; VI-FLUSH-NEXT: s_mov_b32 s14, s10 +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s2, -1 +; VI-FLUSH-NEXT: s_mov_b32 s14, s2 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_mov_b32 s12, s6 ; VI-FLUSH-NEXT: s_mov_b32 s13, s7 -; VI-FLUSH-NEXT: s_mov_b32 s15, s11 -; VI-FLUSH-NEXT: s_mov_b32 s2, s10 -; VI-FLUSH-NEXT: s_mov_b32 s3, s11 +; VI-FLUSH-NEXT: s_mov_b32 s15, s3 +; VI-FLUSH-NEXT: s_mov_b32 s10, s2 +; VI-FLUSH-NEXT: s_mov_b32 s11, s3 ; VI-FLUSH-NEXT: buffer_load_ushort v0, 
off, s[12:15], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: s_mov_b32 s8, s4 -; VI-FLUSH-NEXT: s_mov_b32 s9, s5 +; VI-FLUSH-NEXT: s_mov_b32 s0, s4 +; VI-FLUSH-NEXT: s_mov_b32 s1, s5 ; VI-FLUSH-NEXT: v_madmk_f16 v0, v0, 0x4200, v1 -; VI-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-FLUSH-NEXT: s_endpgm ; ; VI-DENORM-LABEL: fmuladd_f16_imm_b: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 -; VI-DENORM-NEXT: s_mov_b32 s10, -1 -; VI-DENORM-NEXT: s_mov_b32 s14, s10 +; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000 +; VI-DENORM-NEXT: s_mov_b32 s2, -1 +; VI-DENORM-NEXT: s_mov_b32 s14, s2 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_mov_b32 s12, s6 ; VI-DENORM-NEXT: s_mov_b32 s13, s7 -; VI-DENORM-NEXT: s_mov_b32 s15, s11 -; VI-DENORM-NEXT: s_mov_b32 s2, s10 -; VI-DENORM-NEXT: s_mov_b32 s3, s11 +; VI-DENORM-NEXT: s_mov_b32 s15, s3 +; VI-DENORM-NEXT: s_mov_b32 s10, s2 +; VI-DENORM-NEXT: s_mov_b32 s11, s3 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: s_movk_i32 s0, 0x4200 -; VI-DENORM-NEXT: s_mov_b32 s8, s4 -; VI-DENORM-NEXT: s_mov_b32 s9, s5 -; VI-DENORM-NEXT: v_fma_f16 v0, v0, s0, v1 -; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-DENORM-NEXT: s_mov_b32 s0, s4 +; VI-DENORM-NEXT: s_movk_i32 s4, 0x4200 +; VI-DENORM-NEXT: s_mov_b32 s1, s5 +; VI-DENORM-NEXT: 
v_fma_f16 v0, v0, s4, v1 +; VI-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-DENORM-NEXT: s_endpgm ; ; GFX10-FLUSH-LABEL: fmuladd_f16_imm_b: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 -; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11 -; GFX10-FLUSH-NEXT: s_mov_b32 s2, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s3, s11 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 -; GFX10-FLUSH-NEXT: s_mov_b32 s8, s4 +; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s9, s5 +; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmuladd_f16_imm_b: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 -; GFX10-DENORM-NEXT: 
s_mov_b32 s11, 0x31016000 -; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s15, s11 -; GFX10-DENORM-NEXT: s_mov_b32 s2, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s3, s11 +; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-DENORM-NEXT: s_mov_b32 s2, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s10, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s11, s3 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 ; GFX10-DENORM-NEXT: s_mov_b32 s13, s7 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s8, s4 -; GFX10-DENORM-NEXT: s_mov_b32 s9, s5 +; GFX10-DENORM-NEXT: s_mov_b32 s0, s4 +; GFX10-DENORM-NEXT: s_mov_b32 s1, s5 ; GFX10-DENORM-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1 -; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_b: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -574,8 +574,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; GFX11-DENORM-LABEL: fmuladd_f16_imm_b: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DENORM-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 @@ -609,7 +609,7 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( define amdgpu_kernel void @fmuladd_v2f16( ; SI-LABEL: fmuladd_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -653,7 +653,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; VI-FLUSH-LABEL: fmuladd_v2f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 ; VI-FLUSH-NEXT: s_mov_b32 s10, -1 ; VI-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -686,7 +686,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; VI-DENORM-LABEL: fmuladd_v2f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 ; VI-DENORM-NEXT: s_mov_b32 s10, -1 ; VI-DENORM-NEXT: s_mov_b32 s14, s10 @@ -722,7 +722,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX10-FLUSH-LABEL: fmuladd_v2f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -752,7 +752,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX10-DENORM-LABEL: fmuladd_v2f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX10-DENORM-NEXT: 
s_mov_b32 s14, s10 @@ -780,7 +780,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX11-FLUSH-LABEL: fmuladd_v2f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -813,7 +813,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX11-DENORM-LABEL: fmuladd_v2f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll index 2bb4cc617e7f17..aca7d3c720ceb5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll @@ -83,7 +83,7 @@ define i32 @strictfp_func_fpmode_i32() strictfp { define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; GFX6-LABEL: kernel_fpmode_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19) ; GFX6-NEXT: s_and_b32 s4, 0x7f3ff, s4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -95,7 +95,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX7-LABEL: kernel_fpmode_i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19) ; GFX7-NEXT: s_and_b32 s4, 0x7f3ff, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -107,7 +107,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX8-LABEL: kernel_fpmode_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; GFX8-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 19) ; GFX8-NEXT: s_and_b32 s2, 0x7f3ff, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 @@ -119,7 +119,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX9-LABEL: kernel_fpmode_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24) ; GFX9-NEXT: s_and_b32 s2, 0x87f3ff, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -130,7 +130,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX10-LABEL: kernel_fpmode_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_and_b32 s2, 0x87f3ff, s2 @@ -141,7 +141,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX11-LABEL: kernel_fpmode_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24) ; GFX11-NEXT: s_and_b32 s2, 0x87f3ff, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index 2e8049e9765e18..ea823f30f26c22 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -14,8 +14,8 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX7CHECK-LABEL: sgpr_isnan_bf16: ; GFX7CHECK: ; %bb.0: -; GFX7CHECK-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7CHECK-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX7CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7CHECK-NEXT: s_mov_b32 s3, 0xf000 ; GFX7CHECK-NEXT: s_mov_b32 s2, -1 ; 
GFX7CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -28,13 +28,13 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; ; GFX8CHECK-LABEL: sgpr_isnan_bf16: ; GFX8CHECK: ; %bb.0: -; GFX8CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, 0x7fff -; GFX8CHECK-NEXT: s_movk_i32 s2, 0x7f80 +; GFX8CHECK-NEXT: s_movk_i32 s3, 0x7f80 ; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX8CHECK-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s2, v0 +; GFX8CHECK-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s3, v0 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -43,26 +43,26 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_bf16: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX9CHECK-NEXT: s_movk_i32 s2, 0x7f80 +; GFX9CHECK-NEXT: s_movk_i32 s0, 0x7f80 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX9CHECK-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s2, v1 +; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s0, v1 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9CHECK-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9CHECK-NEXT: s_endpgm ; ; GFX10CHECK-LABEL: sgpr_isnan_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_clause 0x1 -; GFX10CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10CHECK-NEXT: 
s_load_dword s2, s[0:1], 0x2c +; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4 +; GFX10CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s2 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX10CHECK-NEXT: global_store_dword v1, v0, s[0:1] @@ -71,11 +71,11 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX11CHECK-LABEL: sgpr_isnan_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v1, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4 +; GFX11CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s2 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11CHECK-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index 9c248bd6e8b2aa..da64c379672ef7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -13,8 +13,8 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX7SELDAG-LABEL: sgpr_isnan_f16: ; GFX7SELDAG: ; %bb.0: -; GFX7SELDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7SELDAG-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7SELDAG-NEXT: s_mov_b32 s3, 0xf000 ; GFX7SELDAG-NEXT: s_mov_b32 s2, -1 ; GFX7SELDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -27,11 +27,11 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; ; 
GFX7GLISEL-LABEL: sgpr_isnan_f16: ; GFX7GLISEL: ; %bb.0: -; GFX7GLISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7GLISEL-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7GLISEL-NEXT: s_mov_b32 s2, -1 ; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX7GLISEL-NEXT: s_and_b32 s3, s4, 0x7fff +; GFX7GLISEL-NEXT: s_and_b32 s3, s3, 0x7fff ; GFX7GLISEL-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX7GLISEL-NEXT: s_cmpk_gt_u32 s3, 0x7c00 ; GFX7GLISEL-NEXT: s_cselect_b32 s3, 1, 0 @@ -43,10 +43,10 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; ; GFX8CHECK-LABEL: sgpr_isnan_f16: ; GFX8CHECK: ; %bb.0: -; GFX8CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s4, 3 +; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -55,23 +55,23 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_f16: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s4, 3 -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] -; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[0:1], s4, 3 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9CHECK-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9CHECK-NEXT: s_endpgm ; ; 
GFX10CHECK-LABEL: sgpr_isnan_f16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_clause 0x1 -; GFX10CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s2, s4, 3 +; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10CHECK-NEXT: s_endpgm @@ -79,11 +79,11 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX11CHECK-LABEL: sgpr_isnan_f16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s4, 3 +; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11CHECK-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll index a807885e0d8539..347e549e7cf566 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll @@ -13,8 +13,8 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; GFX7SELDAG-LABEL: sgpr_isnan_f32: ; GFX7SELDAG: ; %bb.0: -; GFX7SELDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7SELDAG-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7SELDAG-NEXT: s_mov_b32 s3, 0xf000 ; GFX7SELDAG-NEXT: s_mov_b32 s2, -1 ; 
GFX7SELDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -25,22 +25,22 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; ; GFX7GLISEL-LABEL: sgpr_isnan_f32: ; GFX7GLISEL: ; %bb.0: -; GFX7GLISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7GLISEL-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7GLISEL-NEXT: s_mov_b32 s2, -1 -; GFX7GLISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX7GLISEL-NEXT: v_cmp_class_f32_e64 s[4:5], s4, 3 +; GFX7GLISEL-NEXT: v_cmp_class_f32_e64 s[4:5], s3, 3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; GFX7GLISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX7GLISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7GLISEL-NEXT: s_endpgm ; ; GFX8CHECK-LABEL: sgpr_isnan_f32: ; GFX8CHECK: ; %bb.0: -; GFX8CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 3 +; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -49,23 +49,23 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_f32: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 3 -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] -; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9CHECK-NEXT: v_cmp_class_f32_e64 
s[0:1], s4, 3 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9CHECK-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9CHECK-NEXT: s_endpgm ; ; GFX10CHECK-LABEL: sgpr_isnan_f32: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_clause 0x1 -; GFX10CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s2, s4, 3 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10CHECK-NEXT: s_endpgm @@ -73,11 +73,11 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; GFX11CHECK-LABEL: sgpr_isnan_f32: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s2, s4, 3 +; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3 ; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] @@ -93,7 +93,7 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; GFX7SELDAG-LABEL: sgpr_isnan_f64: ; GFX7SELDAG: ; %bb.0: -; GFX7SELDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7SELDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7SELDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX7SELDAG-NEXT: s_mov_b32 s6, -1 ; GFX7SELDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -106,7 +106,7 @@ define amdgpu_kernel void 
@sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX7GLISEL-LABEL: sgpr_isnan_f64: ; GFX7GLISEL: ; %bb.0: -; GFX7GLISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7GLISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX7GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] @@ -117,7 +117,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX8SELDAG-LABEL: sgpr_isnan_f64: ; GFX8SELDAG: ; %bb.0: -; GFX8SELDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8SELDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8SELDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX8SELDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -128,7 +128,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX8GLISEL-LABEL: sgpr_isnan_f64: ; GFX8GLISEL: ; %bb.0: -; GFX8GLISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8GLISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8GLISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3 ; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -139,7 +139,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_f64: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3 @@ -149,7 +149,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX10CHECK-LABEL: sgpr_isnan_f64: ; GFX10CHECK: ; %bb.0: -; GFX10CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3 @@ -159,7 +159,7 @@ 
define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX11CHECK-LABEL: sgpr_isnan_f64: ; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index c2f6fbfe4667c0..d847af780acab3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 @@ -42,15 +42,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; 
SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -70,15 +70,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -94,6 +94,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -101,15 +102,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-GISEL-NEXT: 
v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 @@ -125,6 +126,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -132,17 +134,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf +; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 +; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf 
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0 ; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2 @@ -154,20 +156,20 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -181,18 +183,19 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff @@ -204,9 +207,8 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s3 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -214,13 +216,14 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff @@ -230,11 +233,10 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -316,7 +318,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf @@ -357,7 +359,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -396,7 +398,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 @@ -443,7 +445,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -490,7 +492,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217 @@ -528,7 +530,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -566,7 +568,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX1100-SDAG-LABEL: s_log_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 @@ -601,7 +603,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX1100-GISEL-LABEL: s_log_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -747,8 +749,8 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -800,8 +802,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -853,7 +855,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 
v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 @@ -862,7 +864,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 @@ -919,8 +921,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -984,8 +986,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1035,8 +1037,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; 
GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -1087,19 +1089,19 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-LABEL: s_log_v3f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 @@ -1120,7 +1122,7 @@ 
define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1143,19 +1145,19 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL-LABEL: s_log_v3f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8 -; 
GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 @@ -1176,7 +1178,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1353,8 +1355,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3377d1cf @@ -1417,8 +1419,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; 
SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 @@ -1481,8 +1483,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1563,8 +1565,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,8 +1647,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3377d1cf @@ -1708,8 +1710,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; 
GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 @@ -1772,32 +1774,32 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-SDAG-LABEL: s_log_v4f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2 ; 
GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 @@ -1833,32 +1835,32 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-GISEL-LABEL: s_log_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 
s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 
0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 0a1f7ab6fc0ae3..3f060de9f6596d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log10_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 @@ -42,15 +42,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log10_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, 
v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -70,15 +70,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log10_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -94,6 +94,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -101,15 +102,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log10_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) 
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 @@ -125,6 +126,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -132,17 +134,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf +; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a +; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0 ; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2 @@ -154,20 +156,20 
@@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -181,18 +183,19 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log10_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 
s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff @@ -204,9 +207,8 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s3 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -214,13 +216,14 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log10_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff @@ -230,11 +233,10 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -316,7 +318,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf @@ -357,7 +359,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ 
-396,7 +398,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 @@ -443,7 +445,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -490,7 +492,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a @@ -528,7 +530,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -566,7 +568,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log10_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 
0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 @@ -601,7 +603,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log10_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -747,8 +749,8 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log10_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -800,8 +802,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -853,7 +855,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 @@ -862,7 +864,7 @@ define 
amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 @@ -919,8 +921,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -984,8 +986,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1035,8 +1037,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: 
v_mov_b32_e32 v3, 0x3e9a209a @@ -1087,19 +1089,19 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-LABEL: s_log10_v3f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 @@ -1120,7 +1122,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1143,19 +1145,19 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-LABEL: s_log10_v3f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; 
GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 @@ -1176,7 +1178,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1353,8 +1355,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log10_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3284fbcf @@ -1417,8 +1419,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 
0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a @@ -1481,8 +1483,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1563,8 +1565,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,8 +1647,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3284fbcf @@ -1708,8 +1710,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; 
GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a @@ -1772,32 +1774,32 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-SDAG-LABEL: s_log10_v4f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: 
v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 @@ -1833,32 +1835,32 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL-LABEL: s_log10_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 +; GFX1100-GISEL-NEXT: 
v_cmp_gt_f32_e64 s9, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff 
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 7ca04cc2356053..035b2439eff153 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log2_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 @@ -33,35 +33,35 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log2_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; 
SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log2_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -71,14 +71,14 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log2_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-GISEL-NEXT: 
v_cndmask_b32_e32 v1, 0, v1, vcc @@ -90,8 +90,8 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log2_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -103,44 +103,43 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 -; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_f32: ; GFX1100-SDAG: ; %bb.0: -; 
GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s4 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -148,19 +147,19 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log2_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 
0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s4 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 -; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -216,7 +215,7 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log2_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -242,7 +241,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -266,7 +265,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 
0x4f800000 @@ -290,7 +289,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -314,7 +313,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 @@ -337,7 +336,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_log2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -360,7 +359,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log2_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 @@ -385,7 +384,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log2_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -473,8 +472,8 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log2_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -505,8 +504,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 @@ -537,8 +536,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -568,11 +567,11 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc @@ -599,8 +598,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -624,16 +623,16 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 -; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc @@ -659,32 +658,32 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log2_v3f32: ; GFX1100-SDAG: 
; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v5, s4, v5 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, s5, v4 :: v_dual_mul_f32 v5, s4, v5 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: 
v_log_f32_e32 v5, v5 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0 -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v2, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v5, v3 :: v_dual_sub_f32 v1, v4, v1 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3 ; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -693,21 +692,21 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-LABEL: s_log2_v3f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s2 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 @@ -814,45 +813,45 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log2_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; SI-SDAG-NEXT: 
v_cndmask_b32_e32 v7, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 -; SI-SDAG-NEXT: v_mul_f32_e32 v8, s5, v8 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s2, v6 +; SI-SDAG-NEXT: v_mul_f32_e32 v8, s1, v8 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 ; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_log_f32_e32 v8, v8 ; SI-SDAG-NEXT: v_log_f32_e32 v9, v1 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -888,8 +887,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; 
VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -925,8 +924,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -962,8 +961,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -993,13 +992,13 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v7, v6 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v9, v8 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -1029,41 +1028,42 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) 
%out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_v4f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s1 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 0x4f800000, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s7, v2 :: v_dual_mul_f32 v3, s6, v3 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s5, v6 :: v_dual_mul_f32 v7, s4, v7 
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0 ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1072,32 +1072,32 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL-LABEL: s_log2_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 +; 
GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9 ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff 
; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 5d3a5800bcdd8f..fa7ee9e8d28ff6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -425,8 +425,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX7-LABEL: s_maximum_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -442,10 +442,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX8-LABEL: s_maximum_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_max_f16_e32 v1, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_max_f16_e32 v1, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: ;;#ASMSTART @@ -456,10 +456,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX9-LABEL: s_maximum_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_max_f16_e32 v1, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_max_f16_e32 v1, s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: ;;#ASMSTART @@ -485,8 +485,8 @@ define void @s_maximum_f16(half inreg %src0, half 
inreg %src1) { ; GFX10-LABEL: s_maximum_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e64 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 +; GFX10-NEXT: v_max_f16_e64 v0, s4, s5 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: ;;#ASMSTART @@ -870,10 +870,10 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX7-LABEL: s_maximum_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s17 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, s16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, s6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, s6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, s4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -897,16 +897,16 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-LABEL: s_maximum_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s4, s7, 16 -; GFX8-NEXT: s_lshr_b32 s5, s6, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_max_f16_e32 v1, s5, v0 +; GFX8-NEXT: s_lshr_b32 s6, s5, 16 +; GFX8-NEXT: s_lshr_b32 s7, s4, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_max_f16_e32 v1, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_max_f16_e32 v3, s6, v1 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_max_f16_e32 v3, s4, v1 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: 
v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -918,17 +918,17 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX9-LABEL: s_maximum_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_lshr_b32 s4, s7, 16 -; GFX9-NEXT: v_pk_max_f16 v1, s6, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_lshr_b32 s5, s5, 16 +; GFX9-NEXT: v_pk_max_f16 v1, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 -; GFX9-NEXT: s_lshr_b32 s5, s6, 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -963,13 +963,13 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-LABEL: s_maximum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 -; GFX10-NEXT: s_lshr_b32 s4, s7, 16 -; GFX10-NEXT: s_lshr_b32 s5, s6, 16 +; GFX10-NEXT: v_pk_max_f16 v0, s4, s5 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s5 +; GFX10-NEXT: s_lshr_b32 s6, s5, 16 +; GFX10-NEXT: s_lshr_b32 s4, s4, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s6 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, 
v1, vcc_lo ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index e6655aeab7e9b2..f4aa40dbd9bcd7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -401,10 +401,10 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX7-LABEL: s_maximum_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_max_f32_e32 v1, s6, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_max_f32_e32 v1, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v0 @@ -414,10 +414,10 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX8-LABEL: s_maximum_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_max_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_max_f32_e32 v1, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 @@ -427,10 +427,10 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX9-LABEL: s_maximum_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_max_f32_e32 v1, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_max_f32_e32 v1, s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 @@ -454,8 +454,8 @@ define 
void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX10-LABEL: s_maximum_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s7 +; GFX10-NEXT: v_max_f32_e64 v0, s4, s5 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v0 @@ -781,14 +781,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX7-LABEL: s_maximum_v2f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s17 -; GFX7-NEXT: v_max_f32_e32 v1, s7, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_max_f32_e32 v1, s5, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v3, s6, v0 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX7-NEXT: v_max_f32_e32 v3, s4, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v[0:1] @@ -798,14 +798,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8-LABEL: s_maximum_v2f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s17 -; GFX8-NEXT: v_max_f32_e32 v1, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_max_f32_e32 v1, s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_max_f32_e32 v3, s6, v0 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX8-NEXT: 
v_max_f32_e32 v3, s4, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v[0:1] @@ -815,14 +815,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX9-LABEL: s_maximum_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_max_f32_e32 v1, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_max_f32_e32 v1, s5, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_max_f32_e32 v3, s6, v0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX9-NEXT: v_max_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] @@ -850,11 +850,11 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX10-LABEL: s_maximum_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v0, s7, s17 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s7, s17 -; GFX10-NEXT: v_max_f32_e64 v2, s6, s16 +; GFX10-NEXT: v_max_f32_e64 v0, s5, s7 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s5, s7 +; GFX10-NEXT: v_max_f32_e64 v2, s4, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s16 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll index 9a83c04cad1e3e..e9acbec33f2f39 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll 
@@ -427,10 +427,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX7-LABEL: s_maximum_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_max_f64 v[2:3], s[4:5], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -442,10 +442,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX8-LABEL: s_maximum_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_max_f64 v[2:3], s[4:5], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -457,10 +457,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX9-LABEL: s_maximum_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_max_f64 v[2:3], s[4:5], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -487,8 +487,8 
@@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX10-LABEL: s_maximum_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], s[6:7], s[16:17] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[6:7], s[16:17] +; GFX10-NEXT: v_max_f64 v[0:1], s[4:5], s[6:7] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4 ; GFX10-NEXT: ;;#ASMSTART @@ -844,14 +844,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX7-LABEL: s_maximum_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mov_b32_e32 v5, s19 -; GFX7-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX7-NEXT: v_max_f64 v[0:1], s[6:7], v[4:5] -; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], s[4:5], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] ; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -865,14 +865,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8-LABEL: s_maximum_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 -; GFX8-NEXT: v_mov_b32_e32 v1, s21 -; GFX8-NEXT: v_mov_b32_e32 v5, s19 -; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX8-NEXT: v_max_f64 
v[0:1], s[6:7], v[4:5] -; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], s[4:5], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -886,14 +886,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX9-LABEL: s_maximum_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s20 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s21 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: v_max_f64 v[0:1], s[6:7], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], s[4:5], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -907,11 +907,11 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX940-LABEL: s_maximum_v2f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_max_f64 v[2:3], s[2:3], v[0:1] ; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX940-NEXT: v_cmp_u_f64_e32 
vcc, s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX940-NEXT: v_max_f64 v[4:5], s[0:1], v[0:1] ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -927,14 +927,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX10-LABEL: s_maximum_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[20:21] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[20:21] -; GFX10-NEXT: v_max_f64 v[4:5], s[6:7], s[18:19] -; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[6:7], s[18:19] -; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s5 +; GFX10-NEXT: v_max_f64 v[0:1], s[6:7], s[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, s[6:7], s[10:11] +; GFX10-NEXT: v_max_f64 v[4:5], s[4:5], s[8:9] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[8:9] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s4 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:3] ; GFX10-NEXT: ;;#ASMEND @@ -943,10 +943,10 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX11-LABEL: s_maximum_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[16:17] -; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[16:17] -; GFX11-NEXT: v_max_f64 v[4:5], s[0:1], s[6:7] -; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[6:7] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], s[0:1], s[4:5] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[4:5] ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 @@ -964,8 +964,8 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[16:17] -; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[6:7] +; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[6:7] +; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[4:5] ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use v[0:3] ; GFX12-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index c7913f638798ac..d056a97dc54442 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -13,111 +13,111 @@ declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b) define amdgpu_kernel void @maxnum_f16( ; SI-LABEL: maxnum_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 
+; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: 
s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 ; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX10-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -155,7 +155,7 @@ entry: define amdgpu_kernel void @maxnum_f16_imm_a( ; SI-LABEL: maxnum_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -175,7 +175,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; VI-LABEL: maxnum_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -194,7 +194,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; GFX9-LABEL: maxnum_f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -213,7 +213,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; GFX10-LABEL: maxnum_f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: 
s_mov_b32 s10, s6 @@ -232,7 +232,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; GFX11-LABEL: maxnum_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -263,7 +263,7 @@ entry: define amdgpu_kernel void @maxnum_f16_imm_b( ; SI-LABEL: maxnum_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -283,7 +283,7 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; VI-LABEL: maxnum_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -302,7 +302,7 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; GFX9-LABEL: maxnum_f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -321,7 +321,7 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; GFX10-LABEL: maxnum_f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -340,7 +340,7 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; GFX11-LABEL: maxnum_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -371,8 +371,8 @@ entry: define amdgpu_kernel void 
@maxnum_v2f16( ; SI-LABEL: maxnum_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -396,8 +396,8 @@ define amdgpu_kernel void @maxnum_v2f16( ; ; VI-LABEL: maxnum_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -420,18 +420,18 @@ define amdgpu_kernel void @maxnum_v2f16( ; ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 -; GFX9-NEXT: v_pk_max_f16 v1, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -439,16 +439,16 @@ define amdgpu_kernel void @maxnum_v2f16( ; GFX10-LABEL: maxnum_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 ; GFX10-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -456,8 +456,8 @@ define amdgpu_kernel void @maxnum_v2f16( ; GFX11-LABEL: maxnum_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -486,7 +486,7 @@ entry: define amdgpu_kernel void @maxnum_v2f16_imm_a( ; SI-LABEL: maxnum_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -506,7 +506,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; VI-LABEL: maxnum_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -524,7 +524,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; GFX9-LABEL: maxnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -538,7 +538,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; GFX10-LABEL: maxnum_v2f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -551,7 +551,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; GFX11-LABEL: maxnum_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -576,7 +576,7 @@ entry: define amdgpu_kernel void @maxnum_v2f16_imm_b( ; SI-LABEL: maxnum_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -596,7 +596,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; VI-LABEL: maxnum_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4200 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -614,7 +614,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; GFX9-LABEL: maxnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -628,7 +628,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; GFX10-LABEL: maxnum_v2f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -641,7 +641,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; GFX11-LABEL: maxnum_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -667,8 +667,8 @@ entry: define amdgpu_kernel void @maxnum_v3f16( ; SI-LABEL: maxnum_v3f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -697,8 +697,8 @@ define amdgpu_kernel void @maxnum_v3f16( ; ; VI-LABEL: maxnum_v3f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -725,21 +725,21 @@ define amdgpu_kernel void @maxnum_v3f16( ; ; GFX9-LABEL: maxnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 -; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v2, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -748,17 +748,17 @@ define amdgpu_kernel void @maxnum_v3f16( ; GFX10-LABEL: maxnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_max_f16 v1, v2, v1 ; GFX10-NEXT: v_pk_max_f16 v0, v3, v0 @@ -769,8 +769,8 @@ define amdgpu_kernel void @maxnum_v3f16( ; GFX11-LABEL: maxnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -804,28 +804,28 @@ entry: define amdgpu_kernel void @maxnum_v4f16( ; 
SI-LABEL: maxnum_v4f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: s_lshr_b32 s2, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 -; SI-NEXT: s_lshr_b32 s2, s1, 16 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_lshr_b32 s6, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: s_lshr_b32 s6, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 ; SI-NEXT: v_max_f32_e32 v3, v3, v5 ; SI-NEXT: v_max_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -838,13 +838,13 @@ define amdgpu_kernel void @maxnum_v4f16( ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: 
s_endpgm ; ; VI-LABEL: maxnum_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -876,21 +876,21 @@ define amdgpu_kernel void @maxnum_v4f16( ; ; GFX9-LABEL: maxnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s9, s9 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v2, s8, s8 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -898,17 +898,17 @@ define amdgpu_kernel void @maxnum_v4f16( ; GFX10-LABEL: maxnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_max_f16 v1, v1, v0 ; GFX10-NEXT: v_pk_max_f16 v0, v3, v2 @@ -918,8 +918,8 @@ define amdgpu_kernel void @maxnum_v4f16( ; GFX11-LABEL: maxnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -951,7 +951,7 @@ entry: define amdgpu_kernel void @fmax_v4f16_imm_a( ; SI-LABEL: fmax_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -980,7 +980,7 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; VI-LABEL: fmax_v4f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -1007,7 +1007,7 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; GFX9-LABEL: fmax_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 @@ -1026,7 +1026,7 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; GFX10-LABEL: 
fmax_v4f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1041,7 +1041,7 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; GFX11-LABEL: fmax_v4f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 01effc24e741d1..e00ebff751c73e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -351,10 +351,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX8-LABEL: s_minimum_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_min_f16_e32 v1, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_min_f16_e32 v1, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: ;;#ASMSTART @@ -365,10 +365,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX9-LABEL: s_minimum_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_min_f16_e32 v1, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_min_f16_e32 v1, s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: ;;#ASMSTART @@ 
-394,8 +394,8 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX10-LABEL: s_minimum_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f16_e64 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 +; GFX10-NEXT: v_min_f16_e64 v0, s4, s5 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: ;;#ASMSTART @@ -709,16 +709,16 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-LABEL: s_minimum_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s4, s7, 16 -; GFX8-NEXT: s_lshr_b32 s5, s6, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_min_f16_e32 v1, s5, v0 +; GFX8-NEXT: s_lshr_b32 s6, s5, 16 +; GFX8-NEXT: s_lshr_b32 s7, s4, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_min_f16_e32 v1, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_min_f16_e32 v3, s6, v1 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_min_f16_e32 v3, s4, v1 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -730,17 +730,17 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX9-LABEL: s_minimum_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_lshr_b32 s4, s7, 16 -; GFX9-NEXT: v_pk_min_f16 v1, s6, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: 
s_lshr_b32 s5, s5, 16 +; GFX9-NEXT: v_pk_min_f16 v1, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 -; GFX9-NEXT: s_lshr_b32 s5, s6, 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -775,13 +775,13 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-LABEL: s_minimum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 -; GFX10-NEXT: s_lshr_b32 s4, s7, 16 -; GFX10-NEXT: s_lshr_b32 s5, s6, 16 +; GFX10-NEXT: v_pk_min_f16 v0, s4, s5 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s5 +; GFX10-NEXT: s_lshr_b32 s6, s5, 16 +; GFX10-NEXT: s_lshr_b32 s4, s4, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s6 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index 518fc27c23082b..e056682051aa45 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -401,10 +401,10 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX7-LABEL: s_minimum_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_min_f32_e32 v1, s6, v0 +; 
GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_min_f32_e32 v1, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v0 @@ -414,10 +414,10 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX8-LABEL: s_minimum_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_min_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_min_f32_e32 v1, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 @@ -427,10 +427,10 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX9-LABEL: s_minimum_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_min_f32_e32 v1, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_min_f32_e32 v1, s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 @@ -454,8 +454,8 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX10-LABEL: s_minimum_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f32_e64 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s7 +; GFX10-NEXT: v_min_f32_e64 v0, s4, s5 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v0 @@ -781,14 +781,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX7-LABEL: 
s_minimum_v2f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s17 -; GFX7-NEXT: v_min_f32_e32 v1, s7, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_min_f32_e32 v1, s5, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v3, s6, v0 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX7-NEXT: v_min_f32_e32 v3, s4, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v[0:1] @@ -798,14 +798,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8-LABEL: s_minimum_v2f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s17 -; GFX8-NEXT: v_min_f32_e32 v1, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_min_f32_e32 v1, s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_min_f32_e32 v3, s6, v0 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX8-NEXT: v_min_f32_e32 v3, s4, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v[0:1] @@ -815,14 +815,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX9-LABEL: s_minimum_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_min_f32_e32 v1, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_min_f32_e32 v1, s5, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; 
GFX9-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_min_f32_e32 v3, s6, v0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX9-NEXT: v_min_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] @@ -850,11 +850,11 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX10-LABEL: s_minimum_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f32_e64 v0, s7, s17 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s7, s17 -; GFX10-NEXT: v_min_f32_e64 v2, s6, s16 +; GFX10-NEXT: v_min_f32_e64 v0, s5, s7 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s5, s7 +; GFX10-NEXT: v_min_f32_e64 v2, s4, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s16 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index 81b892d424b46a..d8462ec2202448 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -427,10 +427,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX7-LABEL: s_minimum_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_min_f64 v[2:3], s[4:5], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v4, 
0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -442,10 +442,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX8-LABEL: s_minimum_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_min_f64 v[2:3], s[4:5], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -457,10 +457,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX9-LABEL: s_minimum_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_min_f64 v[2:3], s[4:5], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -487,8 +487,8 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX10-LABEL: s_minimum_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f64 v[0:1], s[6:7], s[16:17] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[6:7], s[16:17] +; GFX10-NEXT: v_min_f64 v[0:1], s[4:5], s[6:7] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4 ; GFX10-NEXT: ;;#ASMSTART @@ -844,14 +844,14 @@ define void @s_minimum_v2f64(<2 x 
double> inreg %src0, <2 x double> inreg %src1) ; GFX7-LABEL: s_minimum_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mov_b32_e32 v5, s19 -; GFX7-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX7-NEXT: v_min_f64 v[0:1], s[6:7], v[4:5] -; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX7-NEXT: v_min_f64 v[0:1], s[4:5], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] ; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -865,14 +865,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8-LABEL: s_minimum_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 -; GFX8-NEXT: v_mov_b32_e32 v1, s21 -; GFX8-NEXT: v_mov_b32_e32 v5, s19 -; GFX8-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], s[6:7], v[4:5] -; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], s[4:5], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ 
-886,14 +886,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX9-LABEL: s_minimum_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s20 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s21 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: v_min_f64 v[0:1], s[6:7], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], s[4:5], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -907,11 +907,11 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX940-LABEL: s_minimum_v2f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_min_f64 v[2:3], s[2:3], v[0:1] ; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX940-NEXT: v_min_f64 v[4:5], s[0:1], v[0:1] ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -927,14 +927,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX10-LABEL: s_minimum_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f64 v[0:1], s[16:17], s[20:21] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, 
s[16:17], s[20:21] -; GFX10-NEXT: v_min_f64 v[4:5], s[6:7], s[18:19] -; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[6:7], s[18:19] -; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s5 +; GFX10-NEXT: v_min_f64 v[0:1], s[6:7], s[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, s[6:7], s[10:11] +; GFX10-NEXT: v_min_f64 v[4:5], s[4:5], s[8:9] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[8:9] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s4 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:3] ; GFX10-NEXT: ;;#ASMEND @@ -943,10 +943,10 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX11-LABEL: s_minimum_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_f64 v[0:1], s[2:3], s[16:17] -; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[16:17] -; GFX11-NEXT: v_min_f64 v[4:5], s[0:1], s[6:7] -; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[6:7] +; GFX11-NEXT: v_min_f64 v[0:1], s[2:3], s[6:7] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], s[0:1], s[4:5] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 @@ -964,8 +964,8 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[16:17] -; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[6:7] +; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[6:7] +; GFX12-NEXT: v_minimum_f64 v[0:1], 
s[0:1], s[4:5] ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use v[0:3] ; GFX12-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 0a004fd7701cfc..f934a2de9247f0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -13,111 +13,111 @@ declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b) define amdgpu_kernel void @minnum_f16_ieee( ; SI-LABEL: minnum_f16_ieee: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_f16_ieee: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; 
VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_f16_ieee: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 
0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_f16_ieee: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 ; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX10-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_f16_ieee: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 
s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -182,7 +182,7 @@ define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 { define amdgpu_kernel void @minnum_f16_imm_a( ; SI-LABEL: minnum_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -202,7 +202,7 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; VI-LABEL: minnum_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -221,7 +221,7 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; GFX9-LABEL: minnum_f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -240,7 +240,7 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; GFX10-LABEL: minnum_f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -259,7 +259,7 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; GFX11-LABEL: minnum_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -289,7 +289,7 @@ entry: define amdgpu_kernel void @minnum_f16_imm_b( ; SI-LABEL: minnum_f16_imm_b: ; SI: ; %bb.0: ; %entry -; 
SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -309,7 +309,7 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; VI-LABEL: minnum_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -328,7 +328,7 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; GFX9-LABEL: minnum_f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -347,7 +347,7 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; GFX10-LABEL: minnum_f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -366,7 +366,7 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; GFX11-LABEL: minnum_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -396,8 +396,8 @@ entry: define amdgpu_kernel void @minnum_v2f16_ieee( ; SI-LABEL: minnum_v2f16_ieee: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -421,8 +421,8 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; ; VI-LABEL: minnum_v2f16_ieee: ; VI: ; 
%bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -445,18 +445,18 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 -; GFX9-NEXT: v_pk_max_f16 v1, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -464,16 +464,16 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; GFX10-LABEL: minnum_v2f16_ieee: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX10-NEXT: 
v_pk_max_f16 v1, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 ; GFX10-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -481,8 +481,8 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; GFX11-LABEL: minnum_v2f16_ieee: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -545,7 +545,7 @@ define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) define amdgpu_kernel void @minnum_v2f16_imm_a( ; SI-LABEL: minnum_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -565,7 +565,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; VI-LABEL: minnum_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -583,7 +583,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; GFX9-LABEL: minnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -597,7 +597,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; GFX10-LABEL: minnum_v2f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -610,7 +610,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; GFX11-LABEL: minnum_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -634,7 +634,7 @@ entry: define amdgpu_kernel void @minnum_v2f16_imm_b( ; SI-LABEL: minnum_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -654,7 +654,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; VI-LABEL: minnum_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4200 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -672,7 +672,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; GFX9-LABEL: minnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -686,7 +686,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; GFX10-LABEL: minnum_v2f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -699,7 +699,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; GFX11-LABEL: minnum_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -724,8 +724,8 @@ entry: define amdgpu_kernel void @minnum_v3f16( ; SI-LABEL: minnum_v3f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -754,8 +754,8 @@ define amdgpu_kernel void @minnum_v3f16( ; ; VI-LABEL: minnum_v3f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -782,21 +782,21 @@ define amdgpu_kernel void @minnum_v3f16( ; ; GFX9-LABEL: minnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 -; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v2, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 +; 
GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -805,17 +805,17 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX10-LABEL: minnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_min_f16 v1, v2, v1 ; GFX10-NEXT: v_pk_min_f16 v0, v3, v0 @@ -826,8 +826,8 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX11-LABEL: minnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -860,28 +860,28 @@ entry: define amdgpu_kernel void @minnum_v4f16( ; SI-LABEL: minnum_v4f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: s_lshr_b32 s2, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 -; SI-NEXT: s_lshr_b32 s2, s1, 16 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_lshr_b32 s6, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: s_lshr_b32 s6, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 ; SI-NEXT: v_min_f32_e32 v3, v3, v5 ; SI-NEXT: v_min_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -894,13 +894,13 @@ define amdgpu_kernel void @minnum_v4f16( ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -932,21 +932,21 @@ define amdgpu_kernel void @minnum_v4f16( ; ; GFX9-LABEL: minnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s9, s9 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v2, s8, s8 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -954,17 +954,17 @@ define amdgpu_kernel void @minnum_v4f16( ; GFX10-LABEL: minnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 +; 
GFX10-NEXT: v_pk_max_f16 v2, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_min_f16 v1, v1, v0 ; GFX10-NEXT: v_pk_min_f16 v0, v3, v2 @@ -974,8 +974,8 @@ define amdgpu_kernel void @minnum_v4f16( ; GFX11-LABEL: minnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -1006,7 +1006,7 @@ entry: define amdgpu_kernel void @fmin_v4f16_imm_a( ; SI-LABEL: fmin_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; VI-LABEL: fmin_v4f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; GFX9-LABEL: fmin_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 @@ -1081,7 +1081,7 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; GFX10-LABEL: fmin_v4f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1096,7 +1096,7 @@ define amdgpu_kernel void 
@fmin_v4f16_imm_a( ; ; GFX11-LABEL: fmin_v4f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index 53ea253035655c..c3e665fa8269a0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -332,7 +332,7 @@ bb: define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; SI-LABEL: umulo_i64_s: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 @@ -365,7 +365,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX9-LABEL: umulo_i64_s: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s7, s0, s3 ; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -394,7 +394,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX10-LABEL: umulo_i64_s: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mul_i32 s7, s0, s3 ; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -423,7 +423,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX11-LABEL: umulo_i64_s: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s7, s0, s3 ; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -454,7 +454,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX12-LABEL: umulo_i64_s: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b128 
s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3 @@ -491,7 +491,7 @@ bb: define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; SI-LABEL: smulo_i64_s: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 @@ -540,7 +540,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX9-LABEL: smulo_i64_s: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s7, s0, s3 ; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -581,7 +581,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX10-LABEL: smulo_i64_s: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mul_i32 s7, s0, s3 ; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -622,7 +622,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX11-LABEL: smulo_i64_s: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s7, s0, s3 ; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -667,7 +667,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX12-LABEL: smulo_i64_s: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll index 3d73f84b6e9a80..826862e1249203 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) { ; SI-LABEL: local_size_x: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -17,12 +17,12 @@ define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_x: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x18 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x18 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -45,8 +45,8 @@ entry: define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) { ; SI-LABEL: local_size_y: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x7 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -56,12 +56,12 @@ define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_y: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x1c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: 
v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -84,8 +84,8 @@ entry: define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) { ; SI-LABEL: local_size_z: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -95,12 +95,12 @@ define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_z: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x20 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -123,8 +123,8 @@ entry: define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xy: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mul_i32 s4, s4, s5 @@ -135,13 +135,13 @@ define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xy: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s0, s0, s1 -; 
VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_mul_i32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -166,12 +166,12 @@ entry: define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x6 -; SI-NEXT: s_load_dword s5, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0x6 +; SI-NEXT: s_load_dword s4, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_mul_i32 s4, s2, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -179,11 +179,11 @@ define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x18 -; VI-NEXT: s_load_dword s5, s[2:3], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x18 +; VI-NEXT: s_load_dword s3, s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s2, s4, s5 +; VI-NEXT: s_mul_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -211,7 +211,7 @@ entry: define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_yz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x7 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mul_i32 s0, s0, s1 @@ -224,7 +224,7 @@ define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_yz: ; VI: ; %bb.0: ; 
%entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x1c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mul_i32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -254,13 +254,13 @@ entry: define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xyz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x6 -; SI-NEXT: s_load_dword s6, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6 +; SI-NEXT: s_load_dword s2, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s2, s4, s5 -; SI-NEXT: s_add_i32 s4, s2, s6 +; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_add_i32 s4, s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -268,15 +268,15 @@ define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xyz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 -; VI-NEXT: s_load_dword s4, s[2:3], 0x20 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 +; VI-NEXT: s_load_dword s4, s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: s_add_i32 s0, s0, s4 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_mul_i32 s2, s2, s3 +; VI-NEXT: s_add_i32 s2, s2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -304,8 +304,8 @@ entry: define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_x_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x6 -; SI-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -315,12 +315,12 @@ define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_x_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x18 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x18 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -345,8 +345,8 @@ entry: define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_y_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x7 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -356,12 +356,12 @@ define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_y_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x1c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -386,8 +386,8 @@ entry: define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) { 
; SI-LABEL: local_size_z_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -397,12 +397,12 @@ define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_z_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x20 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll index 47dd0263d020ea..84afa3b0096ea2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -10,7 +10,7 @@ declare <2 x half> @llvm.rint.v2f16(<2 x half> %a) define amdgpu_kernel void @rint_f16( ; SI-LABEL: rint_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @rint_f16( ; ; GFX89-LABEL: rint_f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -48,7 +48,7 @@ define amdgpu_kernel void @rint_f16( ; ; GFX11-LABEL: rint_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 
0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -85,7 +85,7 @@ entry: define amdgpu_kernel void @rint_v2f16( ; SI-LABEL: rint_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -111,7 +111,7 @@ define amdgpu_kernel void @rint_v2f16( ; ; VI-LABEL: rint_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -131,7 +131,7 @@ define amdgpu_kernel void @rint_v2f16( ; ; GFX9-LABEL: rint_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -151,7 +151,7 @@ define amdgpu_kernel void @rint_v2f16( ; ; GFX11-LABEL: rint_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index fc962b1b4a377f..ddbc5ef4e5b600 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; SI-LABEL: round_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s5, 0xfffff ; SI-NEXT: s_mov_b32 s4, s6 @@ -41,7 +41,7 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, 
double %x) #0 { ; ; CI-LABEL: round_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_brev_b32 s5, -2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 @@ -68,7 +68,7 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_round_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -108,7 +108,7 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_round_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -141,65 +141,64 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) #0 { ; SI-LABEL: round_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s9, 0xfffff +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 -; SI-NEXT: s_add_i32 s12, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s12 -; SI-NEXT: s_and_b32 s7, s11, 0x80000000 -; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] -; SI-NEXT: s_cmp_lt_i32 s12, 0 -; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s7, s5 -; SI-NEXT: s_cmp_gt_i32 s12, 51 -; SI-NEXT: s_cselect_b32 s12, s10, s4 -; SI-NEXT: s_cselect_b32 s13, 
s11, s5 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5 -; SI-NEXT: s_brev_b32 s7, -2 -; SI-NEXT: s_and_b64 s[2:3], s[14:15], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s2, 0xfc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; SI-NEXT: s_andn2_b64 s[0:1], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s3, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s2, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s3, s1 -; SI-NEXT: s_cmp_gt_i32 s2, 51 -; SI-NEXT: s_cselect_b32 s1, s9, s1 -; SI-NEXT: s_cselect_b32 s0, s8, s0 -; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: v_add_f64 v[2:3], s[8:9], -v[2:3] +; SI-NEXT: s_bfe_u32 s3, s7, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], s3 +; SI-NEXT: s_and_b32 s12, s7, 0x80000000 +; SI-NEXT: s_andn2_b64 s[10:11], s[6:7], s[10:11] +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s10, 0, s10 +; SI-NEXT: s_cselect_b32 s11, s12, s11 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s10, s6, s10 +; SI-NEXT: s_cselect_b32 s11, s7, s11 +; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[2:3]|, 0.5 -; SI-NEXT: v_bfi_b32 v1, s7, v0, v1 -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_cmp_ge_f64_e64 s[12:13], |v[0:1]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b64 s[12:13], s[12:13], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], s3 +; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] +; SI-NEXT: s_and_b32 
s8, s5, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_cselect_b32 s7, s8, s7 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s6, s4, s6 +; SI-NEXT: s_cselect_b32 s7, s5, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_add_f64 v[2:3], s[4:5], -v[2:3] +; SI-NEXT: s_brev_b32 s12, -2 +; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[2:3]|, 0.5 +; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_add_f64 v[2:3], s[12:13], v[0:1] -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_bfi_b32 v1, s7, v1, v4 -; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_add_f64 v[2:3], s[10:11], v[0:1] +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_bfi_b32 v1, s12, v1, v4 +; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 @@ -233,151 +232,151 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) #0 { ; SI-LABEL: round_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s14 -; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: 
s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s13, 0xfffff +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_brev_b32 s18, -2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s12, s7, 0xb0014 -; SI-NEXT: s_add_i32 s16, s12, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[12:13], s[0:1], s16 -; SI-NEXT: s_and_b32 s15, s7, 0x80000000 -; SI-NEXT: s_andn2_b64 s[12:13], s[6:7], s[12:13] -; SI-NEXT: s_cmp_lt_i32 s16, 0 -; SI-NEXT: s_cselect_b32 s12, 0, s12 -; SI-NEXT: s_cselect_b32 s13, s15, s13 -; SI-NEXT: s_cmp_gt_i32 s16, 51 -; SI-NEXT: s_cselect_b32 s16, s6, s12 -; SI-NEXT: s_cselect_b32 s17, s7, s13 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_bfe_u32 s3, s7, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], s3 +; SI-NEXT: s_and_b32 s16, s7, 0x80000000 +; SI-NEXT: s_andn2_b64 s[14:15], s[6:7], s[14:15] +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s14, 0, s14 +; SI-NEXT: s_cselect_b32 s15, s16, s15 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s14, s6, s14 +; SI-NEXT: s_cselect_b32 s15, s7, s15 +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: v_mov_b32_e32 v1, s15 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 s[18:19], |v[0:1]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: v_cmp_ge_f64_e64 s[16:17], |v[0:1]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b64 s[16:17], s[16:17], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[12:13], s3 +; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] +; SI-NEXT: s_and_b32 s16, s5, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_cselect_b32 s7, s16, s7 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s6, s4, s6 +; SI-NEXT: v_bfi_b32 v5, s18, v0, v1 +; SI-NEXT: s_cselect_b32 s7, s5, s7 +; 
SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b64 s[2:3], s[18:19], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 -; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[0:1], s6 -; SI-NEXT: s_andn2_b64 s[2:3], s[4:5], s[2:3] -; SI-NEXT: s_and_b32 s7, s5, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s6, 0 -; SI-NEXT: s_cselect_b32 s2, 0, s2 -; SI-NEXT: s_cselect_b32 s3, s7, s3 -; SI-NEXT: s_cmp_gt_i32 s6, 51 -; SI-NEXT: s_brev_b32 s15, -2 -; SI-NEXT: s_cselect_b32 s2, s4, s2 -; SI-NEXT: v_bfi_b32 v5, s15, v0, v1 -; SI-NEXT: s_cselect_b32 s3, s5, s3 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[16:17], v[4:5] -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[0:1]|, 0.5 +; SI-NEXT: v_add_f64 v[2:3], s[14:15], v[4:5] +; SI-NEXT: v_cmp_ge_f64_e64 s[16:17], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v6, s5 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 -; SI-NEXT: s_add_i32 s6, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 +; SI-NEXT: s_and_b64 s[14:15], s[16:17], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: s_bfe_u32 s3, s11, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], s3 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] -; SI-NEXT: s_and_b32 s7, s11, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s6, 0 +; SI-NEXT: s_and_b32 s14, s11, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s7, s5 -; SI-NEXT: s_cmp_gt_i32 s6, 51 +; SI-NEXT: s_cselect_b32 s5, s14, s5 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s4, s10, s4 ; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, 
s5 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] -; SI-NEXT: v_bfi_b32 v5, s15, v5, v6 -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[0:1]|, 0.5 -; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5] -; SI-NEXT: s_and_b64 s[2:3], s[6:7], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v8, s2 -; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s2, 0xfc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; SI-NEXT: s_andn2_b64 s[0:1], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s3, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s2, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s3, s1 -; SI-NEXT: s_cmp_gt_i32 s2, 51 -; SI-NEXT: s_cselect_b32 s1, s9, s1 -; SI-NEXT: s_cselect_b32 s0, s8, s0 -; SI-NEXT: v_mov_b32_e32 v6, s1 -; SI-NEXT: v_mov_b32_e32 v5, s0 +; SI-NEXT: v_bfi_b32 v5, s18, v5, v6 +; SI-NEXT: v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5 +; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[4:5] +; SI-NEXT: s_and_b64 s[6:7], s[14:15], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v8, s3 +; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[12:13], s3 +; SI-NEXT: s_andn2_b64 s[6:7], s[8:9], s[6:7] +; SI-NEXT: s_and_b32 s10, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_cselect_b32 s7, s10, s7 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s6, s8, s6 +; SI-NEXT: s_cselect_b32 s7, s9, s7 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v6, s7 ; SI-NEXT: v_add_f64 v[6:7], s[8:9], -v[5:6] ; SI-NEXT: v_mov_b32_e32 v9, s11 -; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[6:7]|, 0.5 -; SI-NEXT: v_bfi_b32 v5, s15, v8, v9 -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[6:7]|, 0.5 +; SI-NEXT: v_bfi_b32 v5, s18, v8, v9 ; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[4:5] -; SI-NEXT: v_mov_b32_e32 v5, s2 +; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; 
SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s3 ; SI-NEXT: v_mov_b32_e32 v8, s9 -; SI-NEXT: v_bfi_b32 v5, s15, v5, v8 -; SI-NEXT: v_add_f64 v[4:5], s[0:1], v[4:5] -; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: v_bfi_b32 v5, s18, v5, v8 +; SI-NEXT: v_add_f64 v[4:5], s[6:7], v[4:5] +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; CI-NEXT: s_brev_b32 s14, -2 +; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; CI-NEXT: s_mov_b32 s15, 0xf000 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] -; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s7 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 +; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] +; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 ; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[6:7] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v8, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 -; CI-NEXT: v_bfi_b32 v5, s14, v8, v5 +; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v8, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[10:11] -; CI-NEXT: s_and_b64 
s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v10, s5 -; CI-NEXT: v_bfi_b32 v5, s14, v5, v10 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[8:9] ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_add_f64 v[6:7], s[8:9], -v[10:11] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[6:7]|, 0.5 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[6:7]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v12, s11 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_bfi_b32 v5, s14, v5, v12 -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v8, s9 -; CI-NEXT: v_bfi_b32 v5, s14, v5, v8 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v8 ; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] -; CI-NEXT: s_mov_b32 s14, -1 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 store <4 x double> %result, ptr addrspace(1) %out @@ 
-387,125 +386,124 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) #0 { ; SI-LABEL: round_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s22 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s21, 0xfffff +; SI-NEXT: s_mov_b32 s20, s2 ; SI-NEXT: v_mov_b32_e32 v8, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s20, s7, 0xb0014 -; SI-NEXT: s_add_i32 s24, s20, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[20:21], s[0:1], s24 -; SI-NEXT: s_and_b32 s23, s7, 0x80000000 -; SI-NEXT: s_andn2_b64 s[20:21], s[6:7], s[20:21] -; SI-NEXT: s_cmp_lt_i32 s24, 0 -; SI-NEXT: s_cselect_b32 s20, 0, s20 -; SI-NEXT: s_cselect_b32 s21, s23, s21 -; SI-NEXT: s_cmp_gt_i32 s24, 51 -; SI-NEXT: s_cselect_b32 s24, s6, s20 -; SI-NEXT: s_cselect_b32 s25, s7, s21 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_mov_b32_e32 v1, s25 +; SI-NEXT: s_bfe_u32 s3, s7, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], s3 +; SI-NEXT: s_and_b32 s24, s7, 0x80000000 +; SI-NEXT: s_andn2_b64 s[22:23], s[6:7], s[22:23] +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s22, 0, s22 +; SI-NEXT: s_cselect_b32 s23, s24, s23 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s22, s6, s22 +; SI-NEXT: s_cselect_b32 s23, s7, s23 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_mov_b32_e32 v1, s23 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] -; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5 +; SI-NEXT: s_brev_b32 s3, -2 +; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b64 s[2:3], s[26:27], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_bfe_u32 s3, s5, 
0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 +; SI-NEXT: s_and_b64 s[24:25], s[24:25], exec +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014 +; SI-NEXT: s_add_i32 s24, s6, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s24 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] -; SI-NEXT: s_and_b32 s23, s5, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s25, s5, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s24, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s23, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_cselect_b32 s7, s25, s7 +; SI-NEXT: s_cmp_gt_i32 s24, 51 ; SI-NEXT: s_cselect_b32 s6, s4, s6 -; SI-NEXT: v_bfi_b32 v9, s2, v0, v1 +; SI-NEXT: v_bfi_b32 v9, s3, v0, v1 ; SI-NEXT: s_cselect_b32 s7, s5, s7 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[24:25], v[8:9] -; SI-NEXT: v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5 +; SI-NEXT: v_add_f64 v[2:3], s[22:23], v[8:9] +; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: s_and_b64 s[24:25], s[26:27], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: s_bfe_u32 s3, s11, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s3 +; SI-NEXT: s_and_b64 s[22:23], s[24:25], exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 +; SI-NEXT: s_add_i32 s22, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s22 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] ; SI-NEXT: s_and_b32 s23, s11, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cmp_lt_i32 s22, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s23, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cmp_gt_i32 s22, 51 ; 
SI-NEXT: s_cselect_b32 s4, s10, s4 ; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] -; SI-NEXT: v_bfi_b32 v9, s2, v4, v5 -; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s3, v4, v5 +; SI-NEXT: v_cmp_ge_f64_e64 s[22:23], |v[0:1]|, 0.5 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[8:9] -; SI-NEXT: s_and_b64 s[6:7], s[24:25], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v6, s3 -; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 +; SI-NEXT: s_and_b64 s[6:7], s[22:23], exec +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: s_bfe_u32 s6, s9, 0xb0014 +; SI-NEXT: s_add_i32 s10, s6, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_andn2_b64 s[6:7], s[8:9], s[6:7] -; SI-NEXT: s_and_b32 s10, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s11, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s10, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s7, s11, s7 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: s_cselect_b32 s6, s8, s6 ; SI-NEXT: s_cselect_b32 s7, s9, s7 ; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[4:5] -; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_bfi_b32 v9, s3, v6, v7 ; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v6, v7 ; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[8:9] ; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: s_bfe_u32 s3, s15, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s3 +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s4 +; 
SI-NEXT: s_bfe_u32 s4, s15, 0xb0014 +; SI-NEXT: s_add_i32 s8, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s8 +; SI-NEXT: v_mov_b32_e32 v10, s9 ; SI-NEXT: s_andn2_b64 s[4:5], s[14:15], s[4:5] -; SI-NEXT: s_and_b32 s8, s15, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s9, s15, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s8, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s8, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s5, s9, s5 +; SI-NEXT: s_cmp_gt_i32 s8, 51 ; SI-NEXT: s_cselect_b32 s4, s14, s4 ; SI-NEXT: s_cselect_b32 s5, s15, s5 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: v_add_f64 v[4:5], s[14:15], -v[4:5] -; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_bfi_b32 v9, s3, v9, v10 ; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[4:5]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v9, v10 ; SI-NEXT: v_add_f64 v[4:5], s[6:7], v[8:9] ; SI-NEXT: s_and_b64 s[6:7], s[8:9], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v12, s3 -; SI-NEXT: s_bfe_u32 s3, s13, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: s_bfe_u32 s6, s13, 0xb0014 +; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s8 ; SI-NEXT: s_andn2_b64 s[6:7], s[12:13], s[6:7] -; SI-NEXT: s_and_b32 s8, s13, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s9, s13, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s8, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s8, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s7, s9, s7 +; SI-NEXT: s_cmp_gt_i32 s8, 51 ; SI-NEXT: s_cselect_b32 s7, s13, s7 ; SI-NEXT: s_cselect_b32 s6, s12, s6 ; SI-NEXT: v_mov_b32_e32 v10, s7 @@ -513,20 +511,20 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_add_f64 v[10:11], s[12:13], -v[9:10] ; SI-NEXT: v_mov_b32_e32 
v13, s15 ; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v12, v13 +; SI-NEXT: v_bfi_b32 v9, s3, v12, v13 ; SI-NEXT: v_add_f64 v[12:13], s[4:5], v[8:9] ; SI-NEXT: s_and_b64 s[4:5], s[8:9], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v14, s3 -; SI-NEXT: s_bfe_u32 s3, s19, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s3 +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_bfe_u32 s4, s19, 0xb0014 +; SI-NEXT: s_add_i32 s8, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s8 ; SI-NEXT: s_andn2_b64 s[4:5], s[18:19], s[4:5] -; SI-NEXT: s_and_b32 s8, s19, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s9, s19, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s8, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s8, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s5, s9, s5 +; SI-NEXT: s_cmp_gt_i32 s8, 51 ; SI-NEXT: s_cselect_b32 s5, s19, s5 ; SI-NEXT: s_cselect_b32 s4, s18, s4 ; SI-NEXT: v_mov_b32_e32 v10, s5 @@ -534,128 +532,129 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_add_f64 v[10:11], s[18:19], -v[9:10] ; SI-NEXT: v_mov_b32_e32 v15, s13 ; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v14, v15 +; SI-NEXT: v_bfi_b32 v9, s3, v14, v15 ; SI-NEXT: v_add_f64 v[10:11], s[6:7], v[8:9] ; SI-NEXT: s_and_b64 s[6:7], s[8:9], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: s_bfe_u32 s3, s17, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 -; SI-NEXT: s_andn2_b64 s[0:1], s[16:17], s[0:1] -; SI-NEXT: s_and_b32 s6, s17, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s6, s1 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s1, s17, s1 -; SI-NEXT: s_cselect_b32 s0, s16, s0 -; 
SI-NEXT: v_mov_b32_e32 v15, s1 -; SI-NEXT: v_mov_b32_e32 v14, s0 +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: s_bfe_u32 s6, s17, 0xb0014 +; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s8 +; SI-NEXT: s_andn2_b64 s[6:7], s[16:17], s[6:7] +; SI-NEXT: s_and_b32 s9, s17, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s8, 0 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_cselect_b32 s7, s9, s7 +; SI-NEXT: s_cmp_gt_i32 s8, 51 +; SI-NEXT: s_cselect_b32 s7, s17, s7 +; SI-NEXT: s_cselect_b32 s6, s16, s6 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v14, s6 ; SI-NEXT: v_add_f64 v[14:15], s[16:17], -v[14:15] ; SI-NEXT: v_mov_b32_e32 v16, s19 -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[14:15]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v9, v16 +; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[14:15]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s3, v9, v16 ; SI-NEXT: v_add_f64 v[16:17], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], s[6:7], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 +; SI-NEXT: s_and_b64 s[4:5], s[8:9], exec +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: v_mov_b32_e32 v14, s17 -; SI-NEXT: v_bfi_b32 v9, s2, v9, v14 -; SI-NEXT: v_add_f64 v[14:15], s[0:1], v[8:9] -; SI-NEXT: s_mov_b32 s23, 0xf000 +; SI-NEXT: v_bfi_b32 v9, s3, v9, v14 +; SI-NEXT: v_add_f64 v[14:15], s[6:7], v[8:9] +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 -; SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 
offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; CI-NEXT: s_brev_b32 s22, -2 +; CI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; CI-NEXT: s_mov_b32 s23, 0xf000 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] ; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s7 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 +; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 ; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[6:7] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v8, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 -; CI-NEXT: v_bfi_b32 v5, s22, v8, v5 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v8, s4 +; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 +; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[10:11] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v10, s5 ; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[8:9] -; CI-NEXT: v_bfi_b32 v5, s22, v5, v10 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec 
; CI-NEXT: v_add_f64 v[10:11], s[8:9], -v[6:7] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[14:15] ; CI-NEXT: v_mov_b32_e32 v12, s11 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_bfi_b32 v5, s22, v5, v12 -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[12:13], s[14:15], -v[10:11] ; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v14, s9 -; CI-NEXT: v_bfi_b32 v5, s22, v5, v14 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v14 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[12:13] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_add_f64 v[12:13], s[12:13], -v[14:15] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v16, s15 -; CI-NEXT: v_bfi_b32 v5, s22, v5, v16 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_bfi_b32 v5, s2, v5, v16 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v18, s13 ; CI-NEXT: 
v_add_f64 v[10:11], s[18:19], -v[16:17] -; CI-NEXT: v_bfi_b32 v5, s22, v5, v18 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 ; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_add_f64 v[18:19], s[16:17], -v[14:15] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[18:19]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v20, s19 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_bfi_b32 v5, s22, v5, v20 -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_bfi_b32 v5, s2, v5, v20 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v18, s17 -; CI-NEXT: v_bfi_b32 v5, s22, v5, v18 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 ; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5] -; CI-NEXT: s_mov_b32 s22, -1 -; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 store <8 x double> %result, ptr addrspace(1) %out diff 
--git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index d5b4f879bf8a02..7ad7cc821c1b56 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX6-LABEL: round_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -24,39 +24,57 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_f32: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f32_e32 v0, s6 -; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] -; GFX89-NEXT: s_brev_b32 s4, -2 -; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f32_e32 v0, s6 +; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX8-NEXT: s_brev_b32 s4, -2 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX8-NEXT: 
v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e32 v0, s2 +; GFX9-NEXT: v_sub_f32_e32 v1, s2, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f32_e32 v0, s4 +; GFX11-NEXT: v_trunc_f32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v1, s4, v0 -; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v1|, 0.5 +; GFX11-NEXT: v_sub_f32_e32 v1, s2, v0 +; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -91,7 +109,7 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { define amdgpu_kernel 
void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #0 { ; GFX6-LABEL: round_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_brev_b32 s8, -2 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -117,7 +135,7 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; ; GFX89-LABEL: round_v2f32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_brev_b32 s8, -2 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 @@ -143,7 +161,7 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; ; GFX11-LABEL: round_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s3 ; GFX11-NEXT: v_trunc_f32_e32 v2, s2 @@ -198,8 +216,8 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #0 { ; GFX6-LABEL: round_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_brev_b32 s10, -2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -235,50 +253,89 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_v4f32: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: s_brev_b32 s10, -2 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX89-NEXT: v_trunc_f32_e32 v0, s7 -; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX89-NEXT: v_mov_b32_e32 v2, s7 -; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s6 -; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s5 -; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] -; GFX89-NEXT: v_mov_b32_e32 v4, s5 -; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v4 -; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s4 -; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] -; GFX89-NEXT: v_mov_b32_e32 v5, s4 -; GFX89-NEXT: v_bfi_b32 v4, s10, v4, v5 -; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_v4f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_brev_b32 s10, -2 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f32_e32 v0, s7 +; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s6 +; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 
0, 1.0, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s5 +; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s4 +; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e32 v0, s7 +; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s6 +; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s5 +; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s4 +; GFX9-NEXT: 
v_sub_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s7 @@ -355,145 +412,145 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #0 { ; GFX6-LABEL: round_v8f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_brev_b32 s2, -2 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; GFX6-NEXT: s_brev_b32 s14, -2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_trunc_f32_e32 v0, s7 ; GFX6-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] ; GFX6-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v3, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s6 ; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 
s[12:13], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] ; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v2, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s5 ; GFX6-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v4, s5 -; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v4 ; GFX6-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s4 ; GFX6-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX6-NEXT: v_bfi_b32 v4, s14, v4, v5 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, s11 ; GFX6-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v6, s11 -; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v6 ; GFX6-NEXT: v_add_f32_e32 v7, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s10 ; GFX6-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v6 ; GFX6-NEXT: v_add_f32_e32 v6, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s9 ; GFX6-NEXT: v_sub_f32_e32 v5, 
s9, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v8, s9 -; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v8 +; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v8 ; GFX6-NEXT: v_add_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s8 ; GFX6-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v9, s8 -; GFX6-NEXT: v_bfi_b32 v8, s2, v8, v9 +; GFX6-NEXT: v_bfi_b32 v8, s14, v8, v9 ; GFX6-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX89-LABEL: round_v8f32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX89-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; GFX89-NEXT: s_brev_b32 s2, -2 -; GFX89-NEXT: s_mov_b32 s15, 0xf000 -; GFX89-NEXT: s_mov_b32 s14, -1 +; GFX89-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GFX89-NEXT: s_brev_b32 s14, -2 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_trunc_f32_e32 v0, s7 ; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] ; GFX89-NEXT: v_mov_b32_e32 v2, s7 -; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2 ; GFX89-NEXT: v_add_f32_e32 v3, v0, v1 ; 
GFX89-NEXT: v_trunc_f32_e32 v0, s6 ; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] ; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2 ; GFX89-NEXT: v_add_f32_e32 v2, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s5 ; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] ; GFX89-NEXT: v_mov_b32_e32 v4, s5 -; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v4 ; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s4 ; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] ; GFX89-NEXT: v_mov_b32_e32 v5, s4 -; GFX89-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX89-NEXT: v_bfi_b32 v4, s14, v4, v5 ; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX89-NEXT: v_trunc_f32_e32 v4, s11 ; GFX89-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX89-NEXT: v_mov_b32_e32 v6, s11 -; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6 ; GFX89-NEXT: v_add_f32_e32 v7, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s10 ; GFX89-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX89-NEXT: 
v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX89-NEXT: v_mov_b32_e32 v6, s10 -; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6 ; GFX89-NEXT: v_add_f32_e32 v6, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s9 ; GFX89-NEXT: v_sub_f32_e32 v5, s9, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX89-NEXT: v_mov_b32_e32 v8, s9 -; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v8 +; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v8 ; GFX89-NEXT: v_add_f32_e32 v5, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s8 ; GFX89-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] ; GFX89-NEXT: v_mov_b32_e32 v9, s8 -; GFX89-NEXT: v_bfi_b32 v8, s2, v8, v9 +; GFX89-NEXT: v_bfi_b32 v8, s14, v8, v9 ; GFX89-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: round_v8f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s7 @@ -628,10 +685,10 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) # define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX6-LABEL: round_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb +; 
GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX6-NEXT: v_trunc_f32_e32 v1, v0 ; GFX6-NEXT: v_sub_f32_e32 v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, 0.5 @@ -642,44 +699,62 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX89-NEXT: s_movk_i32 s5, 0x7fff -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f16_e32 v1, s4 -; GFX89-NEXT: v_sub_f16_e32 v2, s4, v1 -; GFX89-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX89-NEXT: v_mov_b32_e32 v2, s4 -; GFX89-NEXT: v_bfi_b32 v0, s5, v0, v2 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: v_add_f16_e32 v0, v1, v0 -; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX8-NEXT: s_movk_i32 s5, 0x7fff +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f16_e32 v1, s4 +; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1 +; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v2 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX8-NEXT: 
buffer_store_short v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e32 v1, s2 +; GFX9-NEXT: v_sub_f16_e32 v2, s2, v1 +; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v2 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f16_e32 v0, s4 +; GFX11-NEXT: v_trunc_f16_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v1, s4, v0 -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v1|, 0.5 +; GFX11-NEXT: v_sub_f16_e32 v1, s2, v0 +; GFX11-NEXT: v_cmp_ge_f16_e64 s3, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -723,13 +798,13 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { define amdgpu_kernel 
void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX6-LABEL: round_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb ; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s1, s0, 16 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_lshr_b32 s3, s2, 16 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX6-NEXT: v_trunc_f32_e32 v3, v1 ; GFX6-NEXT: v_sub_f32_e32 v5, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v2, v0 @@ -748,14 +823,13 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: round_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX8-NEXT: s_movk_i32 s6, 0x7fff ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -782,57 +856,57 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; ; GFX9-LABEL: round_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_movk_i32 s1, 0x7fff +; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: v_trunc_f16_e32 v1, s5 -; GFX9-NEXT: 
v_sub_f16_e32 v2, s5, v1 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: v_trunc_f16_e32 v1, s0 +; GFX9-NEXT: v_sub_f16_e32 v2, s0, v1 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_bfi_b32 v2, s6, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_bfi_b32 v2, s1, v2, v3 ; GFX9-NEXT: v_add_f16_e32 v1, v1, v2 -; GFX9-NEXT: v_trunc_f16_e32 v2, s4 -; GFX9-NEXT: v_sub_f16_e32 v3, s4, v2 +; GFX9-NEXT: v_trunc_f16_e32 v2, s2 +; GFX9-NEXT: v_sub_f16_e32 v3, s2, v2 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_bfi_b32 v0, s6, v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_bfi_b32 v0, s1, v0, v3 ; GFX9-NEXT: v_add_f16_e32 v0, v2, v0 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s5, s4, 16 -; GFX11-NEXT: v_trunc_f16_e32 v1, s4 -; GFX11-NEXT: v_trunc_f16_e32 v0, s5 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_trunc_f16_e32 v1, s2 +; GFX11-NEXT: v_trunc_f16_e32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f16_e32 v3, s4, v1 -; GFX11-NEXT: v_sub_f16_e32 v2, s5, v0 +; GFX11-NEXT: v_sub_f16_e32 v3, s2, v1 +; GFX11-NEXT: v_sub_f16_e32 v2, s3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v2|, 
0.5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s2 +; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v3|, 0.5 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s5 +; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v3|, 0.5 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s4 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll index 70f15bd0aa6131..6a9c4c8d41c202 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll @@ -64,7 +64,7 @@ define amdgpu_gfx void @s_set_rounding(i32 inreg %rounding) { define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; GFX6-LABEL: s_set_rounding_kernel: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX6-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX6-NEXT: ;;#ASMSTART @@ -79,7 +79,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX7-LABEL: s_set_rounding_kernel: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s2, s[2:3], 0x9 +; GFX7-NEXT: s_load_dword 
s2, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX7-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX7-NEXT: ;;#ASMSTART @@ -94,7 +94,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX8-LABEL: s_set_rounding_kernel: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX8-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX8-NEXT: ;;#ASMSTART @@ -109,7 +109,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX9-LABEL: s_set_rounding_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX9-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX9-NEXT: ;;#ASMSTART @@ -124,7 +124,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX10-LABEL: s_set_rounding_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX11-LABEL: s_set_rounding_kernel: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index a70f4d8d900650..2ce0a628686ea0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: sin_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 
@@ -30,7 +30,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: sin_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -46,7 +46,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: sin_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -58,7 +58,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX10-LABEL: sin_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -70,7 +70,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX11-LABEL: sin_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -91,7 +91,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: sin_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -121,7 +121,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: sin_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -142,7 +142,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: sin_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -158,7 +158,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX10-LABEL: sin_v2f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -174,7 +174,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX11-LABEL: sin_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll index c69ebedbec50b5..f2d57ba902e735 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -9,7 +9,7 @@ declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) define amdgpu_kernel void @sqrt_f16( ; SI-LABEL: sqrt_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @sqrt_f16( ; ; VI-LABEL: sqrt_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 
; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -47,7 +47,7 @@ define amdgpu_kernel void @sqrt_f16( ; ; GFX11-LABEL: sqrt_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -83,7 +83,7 @@ entry: define amdgpu_kernel void @sqrt_v2f16( ; SI-LABEL: sqrt_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -109,7 +109,7 @@ define amdgpu_kernel void @sqrt_v2f16( ; ; VI-LABEL: sqrt_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -129,7 +129,7 @@ define amdgpu_kernel void @sqrt_v2f16( ; ; GFX11-LABEL: sqrt_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index 11f5e6ebf99980..d1e2ddcdc6eacf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -9,7 +9,7 @@ declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) define amdgpu_kernel void @trunc_f16( ; SI-LABEL: trunc_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @trunc_f16( ; ; VI-LABEL: trunc_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -47,7 +47,7 @@ define amdgpu_kernel void @trunc_f16( ; ; GFX11-LABEL: trunc_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -84,7 +84,7 @@ entry: define amdgpu_kernel void @trunc_v2f16( ; SI-LABEL: trunc_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @trunc_v2f16( ; ; VI-LABEL: trunc_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -130,7 +130,7 @@ define amdgpu_kernel void @trunc_v2f16( ; ; GFX11-LABEL: trunc_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll index 029c4e51e29934..7c5ab1790c548c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) { ; GFX6-LABEL: constant_load_v8f32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX6-NEXT: s_load_dword s16, s[10:11], 0x0 ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocaptur ; ; GFX12-LABEL: constant_load_v8f32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[8:9], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index 7202ab8b314669..cfaefca3a516d7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_f64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; ; GFX7-HSA-LABEL: constant_load_f64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -34,7 +34,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-NOHSA-LABEL: constant_load_f64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -47,7 +47,7 @@ define amdgpu_kernel 
void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -68,7 +68,7 @@ attributes #0 = { nounwind } define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) { ; GFX6-NOHSA-LABEL: constant_load_2v4f64: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[24:25], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -92,7 +92,7 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; ; GFX7-HSA-LABEL: constant_load_2v4f64: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -114,7 +114,7 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; ; GFX8-NOHSA-LABEL: constant_load_2v4f64: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -136,7 +136,7 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; ; GFX12-LABEL: constant_load_2v4f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], 
s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[20:21], s[18:19], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 7178eaf2e73846..04fba9ef6d86df 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: constant_load_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -65,7 +65,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: constant_load_i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v2i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -101,7 +101,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa ; ; 
GFX8-LABEL: constant_load_v2i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -140,7 +140,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v2i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -157,7 +157,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v3i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -174,7 +174,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v3i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -212,7 +212,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v3i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -229,7 +229,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; 
GFX6-LABEL: constant_load_v4i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -246,7 +246,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v4i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -285,7 +285,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v4i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -302,7 +302,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v8i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -319,7 +319,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v8i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -358,7 +358,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v8i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -375,7 +375,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v16i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -392,7 +392,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v16i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -431,7 +431,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v16i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -448,7 +448,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v32i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -460,7 +460,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v32i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -488,7 +488,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v32i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -505,7 +505,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v64i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -518,7 +518,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v64i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -547,7 +547,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v64i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -565,7 +565,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_i1_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -582,7 +582,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_zextload_i1_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -611,7 +611,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i1_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -629,7 +629,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_i1_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -647,7 +647,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_sextload_i1_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -678,7 +678,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i1_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 
0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -698,7 +698,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -715,7 +715,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -744,7 +744,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -762,7 +762,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -780,7 +780,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -811,7 +811,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -831,7 +831,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -850,7 +850,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -884,7 +884,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] @@ -907,7 +907,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: 
constant_sextload_v2i1_to_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -926,7 +926,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v2i1_to_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -961,7 +961,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i1_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] @@ -983,7 +983,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX12: ; 
%bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v3, s[2:3] @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1137,7 +1137,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v3, s[2:3] @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void 
@constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1226,7 +1226,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1278,7 +1278,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1323,7 +1323,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] @@ -1350,7 +1350,7 @@ define amdgpu_kernel void 
@constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1376,7 +1376,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1443,7 +1443,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] @@ -1481,7 +1481,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1507,7 +1507,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; 
GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1578,7 +1578,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] @@ -1613,7 +1613,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -1649,7 +1649,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1767,7 +1767,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] @@ -1827,7 +1827,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -1863,7 +1863,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1990,7 +1990,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] @@ -2043,7 +2043,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -2132,7 +2132,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2349,7 +2349,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2441,7 +2441,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -2530,7 +2530,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2770,7 +2770,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2859,7 +2859,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3025,7 +3025,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; 
GFX8-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3444,7 +3444,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3614,7 +3614,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3780,7 +3780,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4241,7 +4241,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4400,7 +4400,7 @@ define amdgpu_kernel void 
@constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_i1_to_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4419,7 +4419,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_zextload_i1_to_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4451,7 +4451,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i1_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4471,7 +4471,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_i1_to_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4490,7 +4490,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_sextload_i1_to_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ 
-4523,7 +4523,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i1_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4543,7 +4543,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4562,7 +4562,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4594,7 +4594,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4614,7 +4614,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4633,7 +4633,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4666,7 +4666,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4686,7 +4686,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4707,7 +4707,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4745,7 +4745,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
global_load_u8 v0, v1, s[2:3] @@ -4768,7 +4768,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v2i1_to_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4790,7 +4790,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v2i1_to_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4829,7 +4829,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i1_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] @@ -4854,7 +4854,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4878,7 +4878,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, v5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4931,7 +4931,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v5, s[2:3] @@ -4960,7 +4960,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4986,7 +4986,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5041,7 +5041,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v6, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v6, s[2:3] @@ -5072,7 +5072,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX6: ; %bb.0: -; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -5098,7 +5098,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5158,7 +5158,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -5192,7 +5192,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -5221,7 +5221,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5282,7 +5282,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], 
s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] @@ -5317,7 +5317,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -5353,7 +5353,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5453,7 +5453,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -5494,7 +5494,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -5537,7 +5537,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX8-LABEL: 
constant_sextload_v8i1_to_v8i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5642,7 +5642,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v16, s[2:3] @@ -5687,7 +5687,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -5746,7 +5746,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v6, v2 ; GFX8-NEXT: v_mov_b32_e32 v8, v2 @@ -5930,7 +5930,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5997,7 +5997,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void 
@constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -6069,7 +6069,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -6263,7 +6263,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v32, s[2:3] @@ -6336,7 +6336,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -6443,7 +6443,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -6779,7 
+6779,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6898,7 +6898,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v32i1_to_v32i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -7063,7 +7063,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v32i1_to_v32i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -7444,7 +7444,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i1_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7574,7 +7574,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 
; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7777,7 +7777,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -8428,7 +8428,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8645,7 +8645,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -8968,7 +8968,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0 ; GFX8-NEXT: s_mov_b32 s13, s7 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -9715,7 +9715,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s19, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s5, s19 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 355c296d122ff2..a015a39a7184fc 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: constant_load_i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -38,7 +38,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: constant_load_i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -77,7 +77,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -95,7 +95,7 @@ entry: 
define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v2i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -107,7 +107,7 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -119,7 +119,7 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v2i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -147,7 +147,7 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v2i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -165,7 +165,7 @@ entry: define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v3i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -180,7 +180,7 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 4 @@ -198,7 +198,7 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v3i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s0, 4 @@ -252,7 +252,7 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v3i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -273,7 +273,7 @@ entry: define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v4i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -286,7 +286,7 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -299,7 +299,7 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v4i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -328,7 +328,7 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v4i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -347,7 +347,7 @@ entry: define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v8i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -362,7 +362,7 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -377,7 +377,7 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v8i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -408,7 +408,7 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v8i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -428,7 +428,7 @@ entry: define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v16i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 @@ -449,7 +449,7 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; ; GCN-HSA-LABEL: constant_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-HSA-NEXT: s_add_u32 s10, s8, 16 @@ -473,7 +473,7 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; ; GCN-NOHSA-VI-LABEL: constant_load_v16i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s8, 16 @@ -522,7 +522,7 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs 
; ; GFX12-LABEL: constant_load_v16i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -546,7 +546,7 @@ entry: define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #0 { ; GCN-NOHSA-SI-LABEL: constant_load_v16i16_align2: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -590,7 +590,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; ; GCN-HSA-LABEL: constant_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 @@ -608,7 +608,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; ; GCN-NOHSA-VI-LABEL: constant_load_v16i16_align2: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 14 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 @@ -742,7 +742,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; ; GFX12-LABEL: constant_load_v16i16_align2: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0xf @@ -778,7 +778,7 @@ entry: define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, ptr 
addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -795,7 +795,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -808,7 +808,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -837,7 +837,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_zextload_i16_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -855,7 +855,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; 
GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -872,7 +872,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -885,7 +885,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -915,7 +915,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_sextload_i16_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -933,7 +933,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -950,7 +950,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -963,7 +963,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -992,7 +992,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v1i16_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1010,7 +1010,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1040,7 +1040,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i32: ; 
GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1070,7 +1070,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v1i16_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1103,7 +1103,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1118,7 +1118,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v0, s0 @@ -1152,7 +1152,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v2i16_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1175,7 +1175,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1205,7 +1205,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1240,7 +1240,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v2i16_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1262,7 +1262,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1281,7 +1281,7 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1298,7 +1298,7 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v3i16_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1361,7 +1361,7 @@ entry: define 
amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1380,7 +1380,7 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 @@ -1440,7 +1440,7 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v3i16_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1465,7 +1465,7 @@ entry: define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1484,7 +1484,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1503,7 +1503,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -1545,7 +1545,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v4i16_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1592,7 +1592,7 @@ define amdgpu_kernel void 
@constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1611,7 +1611,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -1655,7 +1655,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v4i16_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1682,7 +1682,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1711,7 +1711,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1743,7 +1743,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v8i16_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1842,7 +1842,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1871,7 +1871,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1903,7 +1903,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) 
%ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1969,7 +1969,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v8i16_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2001,7 +2001,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -2050,7 +2050,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2108,7 +2108,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2219,7 +2219,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v16i16_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2265,7 +2265,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -2314,7 +2314,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2372,7 +2372,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2487,7 +2487,7 @@ define amdgpu_kernel void 
@constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v16i16_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2533,7 +2533,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 @@ -2622,7 +2622,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2732,7 +2732,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2937,7 +2937,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v32i16_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: 
s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3013,7 +3013,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 @@ -3102,7 +3102,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3212,7 +3212,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3427,7 +3427,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v32i16_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3503,7 +3503,7 @@ define 
amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 @@ -3672,7 +3672,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3888,7 +3888,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -4290,7 +4290,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v64i16_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x0 @@ -4426,7 +4426,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr 
addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x10 @@ -4595,7 +4595,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4811,7 +4811,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 @@ -5229,7 +5229,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v64i16_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x40 @@ -5365,7 +5365,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5383,7 +5383,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5397,7 +5397,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 @@ -5429,7 +5429,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_zextload_i16_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5453,7 +5453,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5471,7 +5471,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr 
addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5485,7 +5485,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5519,7 +5519,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_sextload_i16_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -5540,7 +5540,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5558,7 +5558,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v1, s3 @@ -5572,7 +5572,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 @@ -5604,7 +5604,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v1i16_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5623,7 +5623,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5641,7 +5641,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5655,7 +5655,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5689,7 +5689,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v1i16_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -5710,7 +5710,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -5727,7 +5727,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5744,7 +5744,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5782,7 +5782,7 @@ define amdgpu_kernel void 
@constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v2i16_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5805,7 +5805,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -5823,7 +5823,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -5841,7 +5841,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -5882,7 +5882,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v2i16_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5905,7 +5905,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -5928,7 +5928,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5954,7 +5954,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6009,7 +6009,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v4i16_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6037,7 +6037,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, 
ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6064,7 +6064,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6094,7 +6094,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6156,7 +6156,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v4i16_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6187,7 +6187,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6222,7 +6222,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6266,7 +6266,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6357,7 +6357,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v8i16_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6394,7 +6394,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6438,7 +6438,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; 
GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6492,7 +6492,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6598,7 +6598,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v8i16_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6642,7 +6642,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 @@ -6701,7 +6701,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6781,7 +6781,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6945,7 +6945,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v16i16_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7003,7 +7003,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -7081,7 +7081,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7187,7 +7187,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: 
constant_sextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7381,7 +7381,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v16i16_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7452,7 +7452,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 @@ -7559,7 +7559,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7711,7 +7711,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -8026,7 +8026,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v32i16_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8124,7 +8124,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -8272,7 +8272,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8476,7 +8476,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -8854,7 +8854,7 @@ define amdgpu_kernel void 
@constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v32i16_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index f1a6bccc559f04..b0d8f72c22ba7a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -22,7 +22,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX7-HSA-LABEL: constant_load_i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -34,7 +34,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-NOHSA-LABEL: constant_load_i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -62,7 +62,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-HSA-LABEL: constant_load_i32: ; GFX9-HSA: ; %bb.0: ; %entry -; 
GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -73,7 +73,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -91,7 +91,7 @@ entry: define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v2i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -104,7 +104,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v2i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -117,7 +117,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v2i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -146,7 +146,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v2i32: ; 
GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -158,7 +158,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v2i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -177,7 +177,7 @@ entry: define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v3i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -193,7 +193,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -207,7 +207,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v3i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 @@ -242,7 +242,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr 
addrsp ; ; GFX9-HSA-LABEL: constant_load_v3i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -255,7 +255,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -274,7 +274,7 @@ entry: define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v4i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -289,7 +289,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v4i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -304,7 +304,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v4i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -335,7 +335,7 @@ define amdgpu_kernel void 
@constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v4i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -349,7 +349,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v4i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -369,7 +369,7 @@ entry: define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v8i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -390,7 +390,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v8i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16 @@ -414,7 +414,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v8i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: 
s_add_u32 s10, s8, 16 @@ -458,7 +458,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v8i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -477,7 +477,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v8i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -501,7 +501,7 @@ entry: define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v9i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s12, s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -526,7 +526,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v9i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s12, s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -557,7 +557,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v9i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s12, s[10:11], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -614,7 +614,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v9i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s12, s[10:11], 0x20 @@ -636,7 +636,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v9i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x20 @@ -663,7 +663,7 @@ entry: define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v10i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -689,7 +689,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v10i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -721,7 +721,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v10i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: 
s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -780,7 +780,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v10i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x20 @@ -803,7 +803,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v10i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[12:13], s[10:11], 0x20 @@ -831,7 +831,7 @@ entry: define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v11i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -860,7 +860,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v11i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -893,7 +893,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr 
addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v11i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -958,7 +958,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v11i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 @@ -982,7 +982,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v11i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b96 s[12:14], s[10:11], 0x20 @@ -1010,7 +1010,7 @@ entry: define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v12i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1038,7 +1038,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v12i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v12i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v12i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 @@ -1158,7 +1158,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v12i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[12:15], s[10:11], 0x20 @@ -1187,7 +1187,7 @@ entry: define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v16i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 @@ -1220,7 +1220,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v16i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: 
s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 48 @@ -1262,7 +1262,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v16i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_add_u32 s18, s16, 48 @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v16i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 @@ -1365,7 +1365,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v16i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1395,7 +1395,7 @@ entry: define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i32_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1408,7 +1408,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX7-HSA-LABEL: 
constant_zextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1421,7 +1421,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX8-NOHSA-LABEL: constant_zextload_i32_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1451,7 +1451,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX9-HSA-LABEL: constant_zextload_i32_to_i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1462,7 +1462,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_zextload_i32_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1480,7 +1480,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i32_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1494,7 
+1494,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX7-HSA-LABEL: constant_sextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1508,7 +1508,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX8-NOHSA-LABEL: constant_sextload_i32_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1539,7 +1539,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX9-HSA-LABEL: constant_sextload_i32_to_i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_sextload_i32_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1586,7 +1586,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1599,7 +1599,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1629,7 +1629,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1640,7 +1640,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1658,7 +1658,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX6-NOHSA: ; 
%bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1672,7 +1672,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1686,7 +1686,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1717,7 +1717,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1730,7 +1730,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1751,7 +1751,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void 
@constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1766,7 +1766,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1781,7 +1781,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1816,7 +1816,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1829,7 +1829,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1849,7 
+1849,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1866,7 +1866,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1884,7 +1884,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1924,7 +1924,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -1940,7 +1940,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1963,7 +1963,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1982,7 +1982,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2004,7 +2004,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2053,7 +2053,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2070,7 +2070,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; 
GFX12-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2093,7 +2093,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2117,7 +2117,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2145,7 +2145,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2203,7 +2203,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -2226,7 +2226,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2254,7 +2254,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -2281,7 +2281,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2317,7 +2317,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2399,7 +2399,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: 
s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2424,7 +2424,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2453,7 +2453,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -2491,7 +2491,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2543,7 +2543,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2644,7 +2644,7 @@ define 
amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -2683,7 +2683,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2721,7 +2721,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 @@ -2788,7 +2788,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2888,7 +2888,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: 
s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3075,7 +3075,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 @@ -3142,7 +3142,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3201,7 +3201,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 @@ -3244,7 +3244,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3314,7 +3314,7 @@ define amdgpu_kernel void 
@constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3462,7 +3462,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3503,7 +3503,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3544,7 +3544,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 @@ -3680,7 +3680,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; 
GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3870,7 +3870,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -4231,7 +4231,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -4355,7 +4355,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0 @@ -4459,7 +4459,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 @@ -4537,7 +4537,7 @@ define amdgpu_kernel void 
@constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4672,7 +4672,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4957,7 +4957,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5031,7 +5031,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 @@ -5098,7 +5098,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v32i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 @@ -5158,7 +5158,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -5241,7 +5241,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -5375,7 +5375,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v32i32: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -5426,7 +5426,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 46c7c2b08cd64b..66c73fda38743f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; ; GFX7-LABEL: constant_load_i64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -34,7 +34,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: constant_load_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -63,7 +63,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -81,7 +81,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v2i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -96,7 +96,7 
@@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v2i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 @@ -111,7 +111,7 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v2i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 @@ -142,7 +142,7 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v2i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -162,7 +162,7 @@ entry: define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v3i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -182,7 +182,7 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v3i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -205,7 +205,7 @@ define amdgpu_kernel void @constant_load_v3i64(ptr 
addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v3i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -253,7 +253,7 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v3i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[8:9], s[2:3], 0x10 @@ -278,7 +278,7 @@ entry: define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v4i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 @@ -299,7 +299,7 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v4i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-NEXT: s_add_u32 s10, s8, 16 @@ -323,7 +323,7 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v4i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: s_add_u32 s10, s8, 16 @@ -372,7 +372,7 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: 
constant_load_v4i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -396,7 +396,7 @@ entry: define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v8i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 @@ -429,7 +429,7 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v8i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-NEXT: s_add_u32 s18, s16, 48 @@ -471,7 +471,7 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v8i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NEXT: s_add_u32 s18, s16, 48 @@ -558,7 +558,7 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v8i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -588,7 +588,7 @@ entry: define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v16i64: ; GFX6: ; %bb.0: ; 
%entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NEXT: s_mov_b32 s39, 0xf000 @@ -648,7 +648,7 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX7-LABEL: constant_load_v16i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -731,7 +731,7 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: constant_load_v16i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 ; GFX8-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -899,7 +899,7 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v16i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 67a376b8c0f3c5..9000cee7ef9df0 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 
; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; ; GFX7-HSA-LABEL: constant_load_i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -39,7 +39,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-NOHSA-LABEL: constant_load_i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -78,7 +78,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: constant_load_i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -96,7 +96,7 @@ entry: define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v2i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -113,7 +113,7 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v2i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; 
GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -126,7 +126,7 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v2i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -165,7 +165,7 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v2i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -183,7 +183,7 @@ entry: define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v3i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -198,7 +198,7 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v3i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -217,7 +217,7 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v3i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: 
s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -278,7 +278,7 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v3i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -298,7 +298,7 @@ entry: define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v4i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -310,7 +310,7 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v4i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -322,7 +322,7 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v4i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -350,7 +350,7 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v4i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 
s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -368,7 +368,7 @@ entry: define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v8i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -381,7 +381,7 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v8i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -394,7 +394,7 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v8i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -423,7 +423,7 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v8i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -442,7 +442,7 @@ entry: define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v16i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -457,7 +457,7 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v16i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -472,7 +472,7 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v16i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -503,7 +503,7 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v16i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -523,7 +523,7 @@ entry: define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i8_to_i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -540,7 +540,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_zextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; 
GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -553,7 +553,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -582,7 +582,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i8_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -600,7 +600,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i8_to_i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -617,7 +617,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_sextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -630,7 +630,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i32: ; GFX8-NOHSA: ; 
%bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -660,7 +660,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i8_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -678,7 +678,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -695,7 +695,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -708,7 +708,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -737,7 +737,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr 
addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -755,7 +755,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -772,7 +772,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -785,7 +785,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -815,7 +815,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -834,7 +834,7 @@ define 
amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -853,7 +853,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -868,7 +868,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -911,7 +911,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -934,7 +934,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -953,7 +953,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -968,7 +968,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1034,7 +1034,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1053,7 +1053,7 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; 
GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1070,7 +1070,7 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1112,7 +1112,7 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1137,7 +1137,7 @@ entry: define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1156,7 +1156,7 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ 
-1173,7 +1173,7 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1216,7 +1216,7 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1242,7 +1242,7 @@ entry: define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1261,7 +1261,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1280,7 +1280,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1322,7 +1322,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1348,7 +1348,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1367,7 +1367,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1386,7 +1386,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1431,7 +1431,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1486,7 +1486,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1518,7 +1518,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1616,7 +1616,7 @@ define amdgpu_kernel void 
@constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1645,7 +1645,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1677,7 +1677,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 @@ -1747,7 +1747,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1782,7 +1782,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX6-NOHSA: ; %bb.0: -; 
GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1831,7 +1831,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1889,7 +1889,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1999,7 +1999,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2046,7 +2046,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ 
-2095,7 +2095,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2153,7 +2153,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2275,7 +2275,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2326,7 +2326,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2415,7 +2415,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; 
GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2525,7 +2525,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2724,7 +2724,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2802,7 +2802,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2891,7 +2891,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3001,7 +3001,7 @@ define amdgpu_kernel void 
@constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3231,7 +3231,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3315,7 +3315,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3483,7 +3483,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3699,7 +3699,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: 
s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4086,7 +4086,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4222,7 +4222,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4390,7 +4390,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4604,7 +4604,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5047,7 +5047,7 @@ define 
amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5195,7 +5195,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i8_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5213,7 +5213,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_zextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5227,7 +5227,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -5259,7 +5259,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i8_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -5279,7 +5279,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i8_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5297,7 +5297,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_sextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5311,7 +5311,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5345,7 +5345,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i8_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v0, v2, s[2:3] @@ -5366,7 +5366,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX6-NOHSA: ; 
%bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5384,7 +5384,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5398,7 +5398,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5429,7 +5429,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5448,7 +5448,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5466,7 +5466,7 @@ define amdgpu_kernel void 
@constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5480,7 +5480,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5514,7 +5514,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v0, v2, s[2:3] @@ -5535,7 +5535,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5556,7 +5556,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5573,7 +5573,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5620,7 +5620,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5642,7 +5642,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5664,7 +5664,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5682,7 +5682,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5731,7 +5731,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v4, s[2:3] @@ -5756,7 +5756,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -5779,7 +5779,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5805,7 +5805,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5863,7 +5863,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; 
GFX12-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5892,7 +5892,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -5920,7 +5920,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5951,7 +5951,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6014,7 +6014,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6046,7 
+6046,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6081,7 +6081,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6125,7 +6125,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6220,7 +6220,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6260,7 +6260,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; 
GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6305,7 +6305,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6361,7 +6361,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_mov_b32 s5, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -6472,7 +6472,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -6518,7 +6518,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6577,7 
+6577,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6657,7 +6657,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6826,7 +6826,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6887,7 +6887,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6967,7 +6967,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7074,7 +7074,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7275,7 +7275,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7347,7 +7347,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -7454,7 +7454,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7606,7 +7606,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: 
constant_zextload_v32i8_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7934,7 +7934,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8039,7 +8039,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8198,7 +8198,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8406,7 +8406,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 
s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8793,7 +8793,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8932,7 +8932,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i8_to_i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -8949,7 +8949,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_zextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8962,7 +8962,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9000,7 +9000,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i8_to_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -9018,7 +9018,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i8_to_i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9035,7 +9035,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_sextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9048,7 +9048,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9088,7 +9088,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i8_to_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] @@ -9106,7 +9106,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr 
addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9123,7 +9123,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9136,7 +9136,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9174,7 +9174,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -9192,7 +9192,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; 
GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9209,7 +9209,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9222,7 +9222,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9262,7 +9262,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] @@ -9280,7 +9280,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9300,7 +9300,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9316,7 +9316,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -9356,7 +9356,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -9379,7 +9379,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9401,7 +9401,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9419,7 +9419,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i16: ; 
GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -9469,7 +9469,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -9492,7 +9492,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -9512,7 +9512,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9532,7 +9532,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9598,7 
+9598,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -9627,7 +9627,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -9650,7 +9650,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9673,7 +9673,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9749,7 +9749,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -9777,7 +9777,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -9806,7 +9806,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -9835,7 +9835,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -9940,7 +9940,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -9976,7 +9976,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr 
addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -10011,7 +10011,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -10046,7 +10046,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -10171,7 +10171,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -10206,7 +10206,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -10254,7 +10254,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10306,7 +10306,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10501,7 +10501,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -10556,7 +10556,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -10617,7 +10617,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o 
; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10681,7 +10681,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10912,7 +10912,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -10968,7 +10968,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -11054,7 +11054,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11152,7 +11152,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11523,7 +11523,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -11616,7 +11616,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -11729,7 +11729,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11851,7 +11851,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: 
constant_sextload_v32i8_to_v32i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -12297,7 +12297,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 142bc37fdeb755..21e27bfa75531d 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace( ; ; GCN-HSA-LABEL: global_load_i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -40,7 +40,7 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace( ; ; GCN-NOHSA-VI-LABEL: global_load_i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -115,7 +115,7 @@ entry: define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v2i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -132,7 +132,7 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -145,7 +145,7 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v2i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -200,7 +200,7 @@ entry: define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v3i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -218,7 +218,7 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -236,7 +236,7 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v3i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -332,7 +332,7 @@ entry: define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v4i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -349,7 +349,7 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -362,7 +362,7 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v4i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -417,7 +417,7 @@ entry: define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v8i16: ; GCN-NOHSA-SI: ; %bb.0: ; 
%entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -434,7 +434,7 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -447,7 +447,7 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v8i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -502,7 +502,7 @@ entry: define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v16i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -522,7 +522,7 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -546,7 +546,7 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; ; GCN-NOHSA-VI-LABEL: global_load_v16i16: ; 
GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -622,7 +622,7 @@ entry: define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; GCN-NOHSA-SI-LABEL: global_load_v16i16_align2: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, s10 @@ -672,7 +672,7 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; ; GCN-HSA-LABEL: global_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -696,7 +696,7 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; ; GCN-NOHSA-VI-LABEL: global_load_v16i16_align2: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -804,7 +804,7 @@ entry: define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -821,7 +821,7 @@ define 
amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -834,7 +834,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -889,7 +889,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -906,7 +906,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -919,7 +919,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 
0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -977,7 +977,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -994,7 +994,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1007,7 +1007,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1079,7 +1079,7 @@ define amdgpu_kernel void 
@global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1092,7 +1092,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1150,7 +1150,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1169,7 +1169,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1184,7 +1184,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: 
s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1249,7 +1249,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1268,7 +1268,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1283,7 +1283,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1348,7 +1348,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1369,7 +1369,7 @@ define 
amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1385,7 +1385,7 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1458,7 +1458,7 @@ entry: define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1495,7 +1495,7 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 
0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1575,7 +1575,7 @@ entry: define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1596,7 +1596,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1613,7 +1613,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1689,7 +1689,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1711,7 +1711,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: 
global_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1729,7 +1729,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1833,7 +1833,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1859,7 +1859,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: 
s_mov_b32 s10, s6 @@ -1956,7 +1956,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1982,7 +1982,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2008,7 +2008,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -2108,7 +2108,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -2146,7 +2146,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: 
global_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2196,7 +2196,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -2344,7 +2344,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -2382,7 +2382,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2432,7 +2432,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; 
GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -2591,7 +2591,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -2653,7 +2653,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2751,7 +2751,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -3002,7 +3002,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -3064,7 +3064,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr 
addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3162,7 +3162,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -3450,9 +3450,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s9 +; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s3 ; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -3579,7 +3579,7 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3772,14 +3772,14 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: 
s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 +; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s3 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s9 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 @@ -4260,13 +4260,13 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s9 -; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xe8f000 +; GCN-NOHSA-SI-NEXT: s_add_u32 s8, s8, s3 +; GCN-NOHSA-SI-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4289,11 +4289,11 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16 @@ -4370,17 +4370,17 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; 
GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -4573,14 +4573,14 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 +; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s3 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s9 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 @@ -5126,7 +5126,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5144,7 +5144,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5158,7 +5158,7 @@ 
define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5223,7 +5223,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5241,7 +5241,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5255,7 +5255,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5318,7 +5318,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: 
-; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5336,7 +5336,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5350,7 +5350,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5410,7 +5410,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5428,7 +5428,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5442,7 +5442,7 @@ define amdgpu_kernel 
void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5505,7 +5505,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5526,7 +5526,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5543,7 +5543,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5613,7 +5613,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; 
GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5635,7 +5635,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5653,7 +5653,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5727,7 +5727,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5753,7 +5753,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5779,7 +5779,7 @@ define amdgpu_kernel void 
@global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5871,7 +5871,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5898,7 +5898,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5925,7 +5925,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -6022,7 +6022,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; 
GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6058,7 +6058,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 @@ -6102,7 +6102,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -6240,7 +6240,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6277,7 +6277,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6322,7 +6322,7 @@ define amdgpu_kernel void 
@global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -6469,7 +6469,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6527,7 +6527,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 @@ -6613,7 +6613,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -6847,7 +6847,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; 
%bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6907,7 +6907,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6995,7 +6995,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -7247,9 +7247,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s9 +; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s3 ; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, 0 @@ -7374,7 +7374,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: 
s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -7529,7 +7529,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -7965,7 +7965,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -8076,7 +8076,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8250,7 +8250,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index c0649322c81953..0f9cc33d731f12 100644 --- 
a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; ; GCNX3-HSA-LABEL: global_load_i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -39,7 +39,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; ; GCNX3-NOHSA-LABEL: global_load_i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; ; GCN-HSA-LABEL: global_load_i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v1, v0, s[2:3] @@ -88,7 +88,7 @@ entry: define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v2i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 
; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -105,7 +105,7 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v2i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -118,7 +118,7 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v2i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -151,7 +151,7 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v2i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -167,7 +167,7 @@ entry: define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v3i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -185,7 +185,7 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v3i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v3i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -236,7 +236,7 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3] @@ -252,7 +252,7 @@ entry: define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v4i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -269,7 +269,7 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v4i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v4i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: 
s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -315,7 +315,7 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v4i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -331,7 +331,7 @@ entry: define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v8i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -351,7 +351,7 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v8i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -375,7 +375,7 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v8i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -415,7 +415,7 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v8i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 @@ -434,7 +434,7 @@ entry: define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v9i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -457,7 +457,7 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v9i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -492,7 +492,7 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v9i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -543,7 +543,7 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v9i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] @@ -565,7 +565,7 @@ entry: define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v10i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -588,7 +588,7 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v10i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -623,7 +623,7 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v10i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -672,7 +672,7 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v10i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v10, s[2:3] @@ -694,7 +694,7 @@ entry: define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v11i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -718,7 +718,7 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v11i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -753,7 +753,7 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v11i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -807,7 +807,7 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v11i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3] @@ -830,7 +830,7 @@ entry: define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v12i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -853,7 +853,7 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v12i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -888,7 +888,7 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v12i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -938,7 +938,7 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v12i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] @@ -960,7 +960,7 @@ entry: define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v16i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -986,7 +986,7 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v16i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v16i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1090,7 +1090,7 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v16i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] offset:32 @@ -1115,7 +1115,7 @@ entry: define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_i32_to_i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-NOHSA-LABEL: global_zextload_i32_to_i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_zextload_i32_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3] @@ -1198,7 +1198,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, 
ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_i32_to_i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1216,7 +1216,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1230,7 +1230,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-NOHSA-LABEL: global_sextload_i32_to_i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1265,7 +1265,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_sextload_i32_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3] @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v1i32_to_v1i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 
s10, s6 @@ -1300,7 +1300,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1314,7 +1314,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1349,7 +1349,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3] @@ -1365,7 +1365,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v1i32_to_v1i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1383,7 +1383,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3] @@ -1449,7 +1449,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v2i32_to_v2i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1470,7 +1470,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1487,7 +1487,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v2i32_to_v2i64: ; 
GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1528,7 +1528,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3] @@ -1547,7 +1547,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v2i32_to_v2i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1567,7 +1567,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1625,7 +1625,7 @@ define 
amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] @@ -1644,7 +1644,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v4i32_to_v4i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1669,7 +1669,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1694,7 +1694,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1746,7 +1746,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1769,7 +1769,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v4i32_to_v4i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1795,7 +1795,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1821,7 +1821,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1877,7 +1877,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3] @@ -1902,7 +1902,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) 
#0 { ; SI-NOHSA-LABEL: global_zextload_v8i32_to_v8i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1936,7 +1936,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1981,7 +1981,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -2059,7 +2059,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2091,7 +2091,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v8i32_to_v8i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -2129,7 
+2129,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2179,7 +2179,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v23, s[2:3] @@ -2303,7 +2303,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v16i32_to_v16i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2365,7 +2365,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2463,7 +2463,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2613,7 +2613,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32 @@ -2674,7 +2674,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v16i32_to_v16i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2726,7 +2726,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2810,7 +2810,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: 
global_zextload_v16i32_to_v16i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2941,7 +2941,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2995,9 +2995,9 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; SI-NOHSA-NEXT: s_mov_b32 s14, -1 ; SI-NOHSA-NEXT: s_mov_b32 s15, 0xe8f000 -; SI-NOHSA-NEXT: s_add_u32 s12, s12, s9 +; SI-NOHSA-NEXT: s_add_u32 s12, s12, s3 ; SI-NOHSA-NEXT: s_addc_u32 s13, s13, 0 -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -3112,7 +3112,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3305,7 +3305,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; 
GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -3580,12 +3580,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCN-GFX900-HSA: ; %bb.0: -; GCN-GFX900-HSA-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-GFX900-HSA-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-GFX900-HSA-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s13 -; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0 +; GCN-GFX900-HSA-NEXT: s_add_u32 s8, s8, s7 +; GCN-GFX900-HSA-NEXT: s_addc_u32 s9, s9, 0 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112 @@ -3611,11 +3611,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1 -; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[16:19], 0 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[8:11], 0 ; 4-byte Folded Spill ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, 
s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v10 @@ -3654,11 +3654,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[16:19], 0 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51 @@ -3695,7 +3695,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCN-GFX908-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCN-GFX908-HSA: ; %bb.0: -; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 @@ -3811,7 +3811,7 @@ define amdgpu_kernel void 
@global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v32i32_to_v32i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0 @@ -3899,7 +3899,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -4064,7 +4064,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -4303,7 +4303,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4389,7 +4389,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v32i32: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -4423,7 +4423,7 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v32i32: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -4515,7 +4515,7 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v32i32: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -4603,7 +4603,7 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/local-64.ll b/llvm/test/CodeGen/AMDGPU/local-64.ll index a71418f3dbf5ba..26b559ae6fa9a9 100644 --- a/llvm/test/CodeGen/AMDGPU/local-64.ll +++ b/llvm/test/CodeGen/AMDGPU/local-64.ll @@ -9,7 +9,7 @@ ; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 ; GCN: buffer_store_dword [[REG]], -define amdgpu_kernel void @local_i32_load(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_i32_load(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %gep = getelementptr i32, ptr addrspace(3) %in, i32 7 %val = load i32, ptr addrspace(3) %gep, align 4 store i32 %val, ptr addrspace(1) %out, 
align 4 @@ -22,7 +22,7 @@ define amdgpu_kernel void @local_i32_load(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} ; GCN: buffer_store_dword [[REG]], -define amdgpu_kernel void @local_i32_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_i32_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %val = load i32, ptr addrspace(3) %in, align 4 store i32 %val, ptr addrspace(1) %out, align 4 ret void @@ -35,7 +35,7 @@ define amdgpu_kernel void @local_i32_load_0_offset(ptr addrspace(1) %out, ptr ad ; GCN-NOT: add ; GCN: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 ; GCN: buffer_store_byte [[REG]], -define amdgpu_kernel void @local_i8_load_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_i8_load_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %gep = getelementptr i8, ptr addrspace(3) %in, i32 65535 %val = load i8, ptr addrspace(3) %gep, align 4 store i8 %val, ptr addrspace(1) %out, align 4 @@ -56,7 +56,7 @@ define amdgpu_kernel void @local_i8_load_i16_max_offset(ptr addrspace(1) %out, p ; GCN-DAG: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] ; GCN: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] ; GCN: buffer_store_byte [[REG]], -define amdgpu_kernel void @local_i8_load_over_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_i8_load_over_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %gep = getelementptr i8, ptr addrspace(3) %in, i32 65536 %val = load i8, ptr addrspace(3) %gep, align 4 store i8 %val, ptr addrspace(1) %out, align 4 @@ -70,7 +70,7 @@ define amdgpu_kernel void @local_i8_load_over_i16_max_offset(ptr addrspace(1) %o ; GCN-NOT: add ; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 ; GCN: buffer_store_dwordx2 [[REG]], -define amdgpu_kernel void @local_i64_load(ptr addrspace(1) %out, ptr addrspace(3) 
%in) #0 { +define amdgpu_kernel void @local_i64_load(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %gep = getelementptr i64, ptr addrspace(3) %in, i32 7 %val = load i64, ptr addrspace(3) %gep, align 8 store i64 %val, ptr addrspace(1) %out, align 8 @@ -83,7 +83,7 @@ define amdgpu_kernel void @local_i64_load(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; GCN: buffer_store_dwordx2 [[REG]], -define amdgpu_kernel void @local_i64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_i64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %val = load i64, ptr addrspace(3) %in, align 8 store i64 %val, ptr addrspace(1) %out, align 8 ret void @@ -96,7 +96,7 @@ define amdgpu_kernel void @local_i64_load_0_offset(ptr addrspace(1) %out, ptr ad ; GCN-NOT: add ; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 ; GCN: buffer_store_dwordx2 [[REG]], -define amdgpu_kernel void @local_f64_load(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_f64_load(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %gep = getelementptr double, ptr addrspace(3) %in, i32 7 %val = load double, ptr addrspace(3) %gep, align 8 store double %val, ptr addrspace(1) %out, align 8 @@ -109,7 +109,7 @@ define amdgpu_kernel void @local_f64_load(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; GCN: buffer_store_dwordx2 [[REG]], -define amdgpu_kernel void @local_f64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_f64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %val = load double, ptr addrspace(3) %in, align 8 store double %val, ptr addrspace(1) %out, align 8 ret void @@ -121,7 +121,7 @@ define amdgpu_kernel void @local_f64_load_0_offset(ptr addrspace(1) %out, ptr ad ; GCN-NOT: add ; GCN: ds_write_b64 v{{[0-9]+}}, 
{{v\[[0-9]+:[0-9]+\]}} offset:56 -define amdgpu_kernel void @local_i64_store(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_i64_store(ptr addrspace(3) %out) nounwind { %gep = getelementptr i64, ptr addrspace(3) %out, i32 7 store i64 5678, ptr addrspace(3) %gep, align 8 ret void @@ -133,7 +133,7 @@ define amdgpu_kernel void @local_i64_store(ptr addrspace(3) %out) #0 { ; GCN-NOT: add ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @local_i64_store_0_offset(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_i64_store_0_offset(ptr addrspace(3) %out) nounwind { store i64 1234, ptr addrspace(3) %out, align 8 ret void } @@ -144,7 +144,7 @@ define amdgpu_kernel void @local_i64_store_0_offset(ptr addrspace(3) %out) #0 { ; GCN-NOT: add ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define amdgpu_kernel void @local_f64_store(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_f64_store(ptr addrspace(3) %out) nounwind { %gep = getelementptr double, ptr addrspace(3) %out, i32 7 store double 16.0, ptr addrspace(3) %gep, align 8 ret void @@ -155,7 +155,7 @@ define amdgpu_kernel void @local_f64_store(ptr addrspace(3) %out) #0 { ; GFX9-NOT: m0 ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @local_f64_store_0_offset(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_f64_store_0_offset(ptr addrspace(3) %out) nounwind { store double 20.0, ptr addrspace(3) %out, align 8 ret void } @@ -168,7 +168,7 @@ define amdgpu_kernel void @local_f64_store_0_offset(ptr addrspace(3) %out) #0 { ; SI: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 ; CIPLUS: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112 ; GCN: s_endpgm -define amdgpu_kernel void @local_v2i64_store(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_v2i64_store(ptr addrspace(3) %out) nounwind { %gep = getelementptr <2 x i64>, ptr 
addrspace(3) %out, i32 7 store <2 x i64> , ptr addrspace(3) %gep, align 16 ret void @@ -184,7 +184,7 @@ define amdgpu_kernel void @local_v2i64_store(ptr addrspace(3) %out) #0 { ; CIPLUS: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]$}} ; GCN: s_endpgm -define amdgpu_kernel void @local_v2i64_store_0_offset(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_v2i64_store_0_offset(ptr addrspace(3) %out) nounwind { store <2 x i64> , ptr addrspace(3) %out, align 16 ret void } @@ -201,7 +201,7 @@ define amdgpu_kernel void @local_v2i64_store_0_offset(ptr addrspace(3) %out) #0 ; CIPLUS-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240{{$}} ; GCN: s_endpgm -define amdgpu_kernel void @local_v4i64_store(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_v4i64_store(ptr addrspace(3) %out) nounwind { %gep = getelementptr <4 x i64>, ptr addrspace(3) %out, i32 7 store <4 x i64> , ptr addrspace(3) %gep, align 16 ret void @@ -219,9 +219,7 @@ define amdgpu_kernel void @local_v4i64_store(ptr addrspace(3) %out) #0 { ; CIPLUS-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16{{$}} ; GCN: s_endpgm -define amdgpu_kernel void @local_v4i64_store_0_offset(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_v4i64_store_0_offset(ptr addrspace(3) %out) nounwind { store <4 x i64> , ptr addrspace(3) %out, align 16 ret void } - -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index d8a790c7184084..03ee6a325fbbc3 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -6979,28 +6979,28 @@ define void 
@local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) { ; GFX12-LABEL: local_ds_fadd: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x8 ; GFX12-NEXT: s_mov_b32 s6, exec_lo ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s1, s5, 4 -; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX12-NEXT: s_add_co_i32 s3, s5, 4 +; GFX12-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB28_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX12-NEXT: s_lshl_b32 s5, s1, 3 +; GFX12-NEXT: s_lshl_b32 s5, s3, 3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -7009,20 +7009,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX12-NEXT: s_cbranch_execz .LBB28_4 ; GFX12-NEXT: ; %bb.3: -; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7 +; GFX12-NEXT: s_bcnt1_i32_b32 s2, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX12-NEXT: s_lshl_b32 s0, s1, 4 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 
+; GFX12-NEXT: s_lshl_b32 s2, s3, 4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_f32 v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_4: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_brev_b32 s0, 1 +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_brev_b32 s2, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX12-NEXT: v_add_f32_e32 v0, s5, v0 @@ -7031,32 +7031,32 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: .LBB28_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_ctz_i32_b32 s5, s1 +; GFX12-NEXT: s_ctz_i32_b32 s5, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s5 ; GFX12-NEXT: s_lshl_b32 s7, 1, s5 -; GFX12-NEXT: v_writelane_b32 v0, s0, s5 -; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 +; GFX12-NEXT: v_writelane_b32 v0, s2, s5 +; GFX12-NEXT: s_and_not1_b32 s3, s3, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_lg_u32 s1, 0 -; GFX12-NEXT: s_add_f32 s0, s0, s6 +; GFX12-NEXT: s_cmp_lg_u32 s3, 0 +; GFX12-NEXT: s_add_f32 s2, s2, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: ; implicit-def: $vgpr1 -; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12-NEXT: s_and_saveexec_b32 s3, 
vcc_lo +; GFX12-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12-NEXT: s_cbranch_execz .LBB28_8 ; GFX12-NEXT: ; %bb.7: -; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s2 ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_8: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 @@ -7069,10 +7069,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX940-LABEL: local_ds_fadd: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_add_i32 s5, s5, 4 @@ -7080,9 +7080,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX940-NEXT: s_cbranch_execz .LBB28_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX940-NEXT: s_lshl_b32 s8, s5, 3 -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7093,15 +7093,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr 
addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: v_readfirstlane_b32 s10, v1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX940-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX940-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX940-NEXT: s_cbranch_execz .LBB28_4 ; GFX940-NEXT: ; %bb.3: -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX940-NEXT: s_lshl_b32 s0, s5, 4 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX940-NEXT: s_lshl_b32 s2, s5, 4 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f32 v2, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB28_4: @@ -7110,20 +7110,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX940-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, s10 -; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX940-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX940-NEXT: ; implicit-def: $vgpr0 ; GFX940-NEXT: .LBB28_5: ; %ComputeLoop ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX940-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX940-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX940-NEXT: v_readfirstlane_b32 s8, v1 ; GFX940-NEXT: v_readlane_b32 s9, v2, s5 ; GFX940-NEXT: s_mov_b32 m0, s5 -; GFX940-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX940-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX940-NEXT: v_writelane_b32 v0, s8, m0 -; GFX940-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX940-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX940-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX940-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX940-NEXT: ; %bb.6: ; %ComputeEnd @@ 
-7131,16 +7131,16 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX940-NEXT: ; implicit-def: $vgpr2 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX940-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX940-NEXT: s_cbranch_execz .LBB28_8 ; GFX940-NEXT: ; %bb.7: ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB28_8: -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-NEXT: v_readfirstlane_b32 s2, v2 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NEXT: s_nop 0 @@ -7153,28 +7153,28 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX11-LABEL: local_ds_fadd: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x8 ; GFX11-NEXT: s_mov_b32 s6, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s1, s5, 4 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_add_i32 s3, s5, 4 +; GFX11-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX11-NEXT: s_lshl_b32 s5, s1, 3 +; GFX11-NEXT: s_lshl_b32 s5, s3, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 
0x42280000, v1 ; GFX11-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB28_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_mov_b32 s7, exec_lo ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -7183,12 +7183,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX11-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-NEXT: ; %bb.3: -; GFX11-NEXT: s_bcnt1_i32_b32 s0, s7 +; GFX11-NEXT: s_bcnt1_i32_b32 s2, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX11-NEXT: s_lshl_b32 s0, s1, 4 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX11-NEXT: s_lshl_b32 s2, s3, 4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_f32 v2, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -7196,7 +7196,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX11-NEXT: v_add_f32_e32 v0, s5, v0 @@ -7205,25 +7205,25 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: .LBB28_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_ctz_i32_b32 s1, s0 +; GFX11-NEXT: s_ctz_i32_b32 s3, s2 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readlane_b32 s6, v2, s1 -; GFX11-NEXT: s_lshl_b32 s7, 1, s1 +; GFX11-NEXT: v_readlane_b32 s6, v2, s3 +; GFX11-NEXT: s_lshl_b32 s7, 1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 -; GFX11-NEXT: v_writelane_b32 v0, s5, s1 +; GFX11-NEXT: s_and_not1_b32 s2, s2, s7 +; GFX11-NEXT: v_writelane_b32 v0, s5, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: ; implicit-def: $vgpr2 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX11-NEXT: s_cbranch_execz .LBB28_8 ; GFX11-NEXT: ; %bb.7: ; GFX11-NEXT: v_mov_b32_e32 v2, s4 @@ -7231,8 +7231,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB28_8: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 @@ -7245,19 +7245,19 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX10-LABEL: local_ds_fadd: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; 
GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s1, s5, 4 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-NEXT: s_add_i32 s3, s5, 4 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB28_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s1, 3 +; GFX10-NEXT: s_lshl_b32 s5, s3, 3 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7265,18 +7265,18 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: .LBB28_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_mov_b32 s7, exec_lo ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 -; GFX10-NEXT: s_and_saveexec_b32 s6, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v2 +; GFX10-NEXT: s_and_saveexec_b32 s6, s2 ; GFX10-NEXT: s_cbranch_execz .LBB28_4 ; GFX10-NEXT: ; %bb.3: -; GFX10-NEXT: s_bcnt1_i32_b32 s0, s7 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s1, 4 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_bcnt1_i32_b32 s2, s7 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX10-NEXT: s_lshl_b32 s2, s3, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_add_f32 v2, v1 @@ -7287,28 +7287,28 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; 
GFX10-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX10-NEXT: s_mov_b32 s0, exec_lo +; GFX10-NEXT: s_mov_b32 s2, exec_lo ; GFX10-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, s5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s5, vcc_lo ; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: .LBB28_5: ; %ComputeLoop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_ff1_i32_b32 s1, s0 +; GFX10-NEXT: s_ff1_i32_b32 s3, s2 ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-NEXT: v_readlane_b32 s6, v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s7 -; GFX10-NEXT: v_writelane_b32 v0, s5, s1 +; GFX10-NEXT: v_readlane_b32 s6, v2, s3 +; GFX10-NEXT: s_lshl_b32 s7, 1, s3 +; GFX10-NEXT: s_andn2_b32 s2, s2, s7 +; GFX10-NEXT: v_writelane_b32 v0, s5, s3 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: ; implicit-def: $vgpr2 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execz .LBB28_8 ; GFX10-NEXT: ; %bb.7: ; GFX10-NEXT: v_mov_b32_e32 v2, s4 @@ -7318,9 +7318,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: .LBB28_8: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_add_f32_e32 v0, s2, v0 @@ -7331,10 +7330,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, 
ptr addrspace(3) ; ; GFX90A-LABEL: local_ds_fadd: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_add_i32 s5, s5, 4 @@ -7342,9 +7341,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB28_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: s_lshl_b32 s8, s5, 3 -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7355,15 +7354,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX90A-NEXT: s_cbranch_execz .LBB28_4 ; GFX90A-NEXT: ; %bb.3: -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX90A-NEXT: s_lshl_b32 s0, s5, 4 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX90A-NEXT: s_lshl_b32 s2, s5, 4 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f32 v2, v1 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB28_4: @@ -7372,20 +7371,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX90A-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s10 -; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_mov_b64 s[2:3], exec ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX90A-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX90A-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 ; GFX90A-NEXT: v_readlane_b32 s9, v2, s5 ; GFX90A-NEXT: s_mov_b32 m0, s5 -; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd @@ -7393,16 +7392,16 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX90A-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX90A-NEXT: s_cbranch_execz .LBB28_8 ; GFX90A-NEXT: ; %bb.7: ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB28_8: -; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX90A-NEXT: 
v_readfirstlane_b32 s2, v2 ; GFX90A-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 @@ -7414,10 +7413,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX908-LABEL: local_ds_fadd: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX908-NEXT: s_mov_b64 s[0:1], exec -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_add_i32 s5, s5, 4 @@ -7425,9 +7424,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_cbranch_execz .LBB28_2 ; GFX908-NEXT: ; %bb.1: -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX908-NEXT: s_lshl_b32 s8, s5, 3 -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7438,15 +7437,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: v_readfirstlane_b32 s10, v1 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX908-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX908-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_cbranch_execz .LBB28_4 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX908-NEXT: s_lshl_b32 s0, s5, 4 +; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; 
GFX908-NEXT: s_lshl_b32 s2, s5, 4 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, s0 +; GFX908-NEXT: v_mov_b32_e32 v2, s2 ; GFX908-NEXT: ds_add_f32 v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: .LBB28_4: @@ -7455,20 +7454,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX908-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX908-NEXT: v_mov_b32_e32 v1, s10 -; GFX908-NEXT: s_mov_b64 s[0:1], exec +; GFX908-NEXT: s_mov_b64 s[2:3], exec ; GFX908-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX908-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX908-NEXT: ; implicit-def: $vgpr0 ; GFX908-NEXT: .LBB28_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX908-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX908-NEXT: v_readfirstlane_b32 s8, v1 ; GFX908-NEXT: v_readlane_b32 s9, v2, s5 ; GFX908-NEXT: s_mov_b32 m0, s5 -; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX908-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX908-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd @@ -7476,16 +7475,16 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX908-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX908-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX908-NEXT: s_cbranch_execz .LBB28_8 ; GFX908-NEXT: ; %bb.7: ; GFX908-NEXT: v_mov_b32_e32 v2, s4 ; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: .LBB28_8: -; GFX908-NEXT: 
s_or_b64 exec, exec, s[0:1] -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX908-NEXT: v_readfirstlane_b32 s2, v2 ; GFX908-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX908-NEXT: v_mov_b32_e32 v2, s2 @@ -7497,10 +7496,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX8-LABEL: local_ds_fadd: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s5, s5, 4 @@ -7509,9 +7508,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_cbranch_execz .LBB28_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_lshl_b32 s8, s5, 3 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7522,15 +7521,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: v_readfirstlane_b32 s10, v1 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB28_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX8-NEXT: 
s_lshl_b32 s0, s5, 4 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX8-NEXT: s_lshl_b32 s2, s5, 4 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: ds_add_f32 v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB28_4: @@ -7539,20 +7538,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: .LBB28_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_readfirstlane_b32 s8, v1 ; GFX8-NEXT: v_readlane_b32 s9, v2, s5 ; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd @@ -7560,8 +7559,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr2 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB28_8 ; GFX8-NEXT: ; %bb.7: ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -7569,8 +7568,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) 
; GFX8-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB28_8: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 @@ -7583,10 +7582,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX7-LABEL: local_ds_fadd: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 -; GFX7-NEXT: s_mov_b64 s[0:1], exec -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s5, s5, 4 @@ -7598,8 +7597,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_lshl_b32 s8, s5, 3 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: ds_read_b32 v1, v2 -; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_2: ; %atomicrmw.start @@ -7609,8 +7608,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 -; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, v4 +; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_2 ; 
GFX7-NEXT: ; %bb.3: ; %Flow18 @@ -7621,15 +7620,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_cbranch_execz .LBB28_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s0, s5, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshl_b32 s2, s5, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: ds_read_b32 v3, v1 -; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_6: ; %atomicrmw.start2 @@ -7638,8 +7637,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 ; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 -; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_6 @@ -7652,7 +7651,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_add_f32_e32 v2, s10, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: .LBB28_8: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7661,12 +7660,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) 
%out, ptr addrspace(3) ; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX7-NEXT: s_cbranch_execnz .LBB28_8 ; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7675,10 +7674,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX6-LABEL: local_ds_fadd: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 -; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s5, s5, 4 @@ -7690,8 +7689,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_lshl_b32 s8, s5, 3 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ds_read_b32 v1, v2 -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_2: ; %atomicrmw.start @@ -7701,8 +7700,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 -; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, v4 +; GFX6-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_2 ; GFX6-NEXT: ; %bb.3: ; %Flow16 @@ -7713,15 +7712,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_cbranch_execz .LBB28_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s0, s5, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshl_b32 s2, s5, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: ds_read_b32 v3, v1 -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_6: ; %atomicrmw.start2 @@ -7730,8 +7729,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 -; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_6 @@ -7744,7 +7743,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_add_f32_e32 v2, s10, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 
v2, v3, vcc -; GFX6-NEXT: s_mov_b64 s[0:1], 0 +; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: .LBB28_8: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7753,12 +7752,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX6-NEXT: s_cbranch_execnz .LBB28_8 ; GFX6-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7779,26 +7778,26 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) { ; GFX12-LABEL: local_ds_fadd_one_as: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x8 ; GFX12-NEXT: s_mov_b32 s6, exec_lo ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s1, s5, 4 -; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX12-NEXT: s_add_co_i32 s3, s5, 4 +; GFX12-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB29_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX12-NEXT: s_lshl_b32 s5, s1, 3 +; 
GFX12-NEXT: s_lshl_b32 s5, s3, 3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX12-NEXT: .LBB29_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7808,18 +7807,18 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX12-NEXT: s_cbranch_execz .LBB29_4 ; GFX12-NEXT: ; %bb.3: -; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7 +; GFX12-NEXT: s_bcnt1_i32_b32 s2, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX12-NEXT: s_lshl_b32 s0, s1, 4 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX12-NEXT: s_lshl_b32 s2, s3, 4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_f32 v2, v1 ; GFX12-NEXT: .LBB29_4: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_brev_b32 s0, 1 +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_brev_b32 s2, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX12-NEXT: v_add_f32_e32 v0, s5, v0 @@ -7828,30 +7827,30 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: .LBB29_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_ctz_i32_b32 s5, s1 +; GFX12-NEXT: s_ctz_i32_b32 s5, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s5 ; GFX12-NEXT: s_lshl_b32 s7, 1, s5 -; GFX12-NEXT: v_writelane_b32 v0, s0, s5 -; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 +; GFX12-NEXT: v_writelane_b32 v0, s2, s5 +; GFX12-NEXT: s_and_not1_b32 s3, s3, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_lg_u32 s1, 0 -; GFX12-NEXT: s_add_f32 s0, s0, s6 +; GFX12-NEXT: s_cmp_lg_u32 s3, 0 +; GFX12-NEXT: s_add_f32 s2, s2, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: ; implicit-def: $vgpr1 -; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12-NEXT: s_cbranch_execz .LBB29_8 ; GFX12-NEXT: ; %bb.7: -; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s2 ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 ; GFX12-NEXT: .LBB29_8: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7865,10 +7864,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX940-LABEL: local_ds_fadd_one_as: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; 
GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_add_i32 s5, s5, 4 @@ -7876,9 +7875,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX940-NEXT: s_cbranch_execz .LBB29_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX940-NEXT: s_lshl_b32 s8, s5, 3 -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7889,15 +7888,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: v_readfirstlane_b32 s10, v1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX940-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX940-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX940-NEXT: s_cbranch_execz .LBB29_4 ; GFX940-NEXT: ; %bb.3: -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX940-NEXT: s_lshl_b32 s0, s5, 4 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX940-NEXT: s_lshl_b32 s2, s5, 4 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f32 v2, v1 ; GFX940-NEXT: .LBB29_4: ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] @@ -7905,20 +7904,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX940-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, s10 -; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: 
v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX940-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX940-NEXT: ; implicit-def: $vgpr0 ; GFX940-NEXT: .LBB29_5: ; %ComputeLoop ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX940-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX940-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX940-NEXT: v_readfirstlane_b32 s8, v1 ; GFX940-NEXT: v_readlane_b32 s9, v2, s5 ; GFX940-NEXT: s_mov_b32 m0, s5 -; GFX940-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX940-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX940-NEXT: v_writelane_b32 v0, s8, m0 -; GFX940-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX940-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX940-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX940-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX940-NEXT: ; %bb.6: ; %ComputeEnd @@ -7926,15 +7925,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX940-NEXT: ; implicit-def: $vgpr2 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX940-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX940-NEXT: s_cbranch_execz .LBB29_8 ; GFX940-NEXT: ; %bb.7: ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX940-NEXT: .LBB29_8: -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_readfirstlane_b32 s2, v2 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 @@ -7947,26 +7946,26 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: local_ds_fadd_one_as: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x8 ; GFX11-NEXT: s_mov_b32 s6, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr1 ; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s1, s5, 4 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_add_i32 s3, s5, 4 +; GFX11-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11-NEXT: s_cbranch_execz .LBB29_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX11-NEXT: s_lshl_b32 s5, s1, 3 +; GFX11-NEXT: s_lshl_b32 s5, s3, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX11-NEXT: .LBB29_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_mov_b32 s7, exec_lo ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -7976,18 +7975,18 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX11-NEXT: s_cbranch_execz .LBB29_4 ; GFX11-NEXT: ; %bb.3: -; GFX11-NEXT: s_bcnt1_i32_b32 s0, s7 +; GFX11-NEXT: s_bcnt1_i32_b32 s2, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX11-NEXT: s_lshl_b32 s0, s1, 4 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX11-NEXT: s_lshl_b32 s2, s3, 4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_f32 v2, v1 ; GFX11-NEXT: .LBB29_4: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX11-NEXT: s_mov_b32 s0, 
exec_lo +; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX11-NEXT: v_add_f32_e32 v0, s5, v0 @@ -7996,32 +7995,32 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: .LBB29_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_ctz_i32_b32 s1, s0 +; GFX11-NEXT: s_ctz_i32_b32 s3, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readlane_b32 s6, v2, s1 -; GFX11-NEXT: s_lshl_b32 s7, 1, s1 +; GFX11-NEXT: v_readlane_b32 s6, v2, s3 +; GFX11-NEXT: s_lshl_b32 s7, 1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 -; GFX11-NEXT: v_writelane_b32 v0, s5, s1 +; GFX11-NEXT: s_and_not1_b32 s2, s2, s7 +; GFX11-NEXT: v_writelane_b32 v0, s5, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: ; implicit-def: $vgpr2 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX11-NEXT: s_cbranch_execz .LBB29_8 ; GFX11-NEXT: ; %bb.7: ; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX11-NEXT: .LBB29_8: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_load_b64 
s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8034,37 +8033,37 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: local_ds_fadd_one_as: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s1, s5, 4 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-NEXT: s_add_i32 s3, s5, 4 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB29_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s1, 3 +; GFX10-NEXT: s_lshl_b32 s5, s3, 3 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX10-NEXT: .LBB29_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_mov_b32 s7, exec_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 -; GFX10-NEXT: s_and_saveexec_b32 s6, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v2 +; GFX10-NEXT: s_and_saveexec_b32 s6, s2 ; GFX10-NEXT: s_cbranch_execz .LBB29_4 ; GFX10-NEXT: ; %bb.3: -; GFX10-NEXT: s_bcnt1_i32_b32 s0, s7 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s1, 4 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_bcnt1_i32_b32 s2, s7 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX10-NEXT: s_lshl_b32 s2, s3, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: 
v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: ds_add_f32 v2, v1 ; GFX10-NEXT: .LBB29_4: @@ -8072,36 +8071,36 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX10-NEXT: s_mov_b32 s0, exec_lo +; GFX10-NEXT: s_mov_b32 s2, exec_lo ; GFX10-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, s5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s5, vcc_lo ; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: .LBB29_5: ; %ComputeLoop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_ff1_i32_b32 s1, s0 +; GFX10-NEXT: s_ff1_i32_b32 s3, s2 ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-NEXT: v_readlane_b32 s6, v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s7 -; GFX10-NEXT: v_writelane_b32 v0, s5, s1 +; GFX10-NEXT: v_readlane_b32 s6, v2, s3 +; GFX10-NEXT: s_lshl_b32 s7, 1, s3 +; GFX10-NEXT: s_andn2_b32 s2, s2, s7 +; GFX10-NEXT: v_writelane_b32 v0, s5, s3 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: ; implicit-def: $vgpr2 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execz .LBB29_8 ; GFX10-NEXT: ; %bb.7: ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX10-NEXT: .LBB29_8: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -8113,10 +8112,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX90A-LABEL: local_ds_fadd_one_as: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_add_i32 s5, s5, 4 @@ -8124,9 +8123,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB29_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: s_lshl_b32 s8, s5, 3 -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -8137,15 +8136,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX90A-NEXT: s_cbranch_execz .LBB29_4 ; GFX90A-NEXT: ; %bb.3: -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX90A-NEXT: s_lshl_b32 s0, s5, 4 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX90A-NEXT: s_lshl_b32 s2, 
s5, 4 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f32 v2, v1 ; GFX90A-NEXT: .LBB29_4: ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] @@ -8153,20 +8152,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX90A-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s10 -; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_mov_b64 s[2:3], exec ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX90A-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX90A-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 ; GFX90A-NEXT: v_readlane_b32 s9, v2, s5 ; GFX90A-NEXT: s_mov_b32 m0, s5 -; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd @@ -8174,15 +8173,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX90A-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX90A-NEXT: s_cbranch_execz .LBB29_8 ; GFX90A-NEXT: ; %bb.7: ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX90A-NEXT: .LBB29_8: -; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x0 +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v2 ; GFX90A-NEXT: v_add_f32_e32 v0, s2, v0 @@ -8194,10 +8193,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX908-LABEL: local_ds_fadd_one_as: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX908-NEXT: s_mov_b64 s[0:1], exec -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_add_i32 s5, s5, 4 @@ -8205,9 +8204,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_cbranch_execz .LBB29_2 ; GFX908-NEXT: ; %bb.1: -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX908-NEXT: s_lshl_b32 s8, s5, 3 -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -8218,15 +8217,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: v_readfirstlane_b32 s10, v1 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX908-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX908-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_cbranch_execz .LBB29_4 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, 
s0 -; GFX908-NEXT: s_lshl_b32 s0, s5, 4 +; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX908-NEXT: s_lshl_b32 s2, s5, 4 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, s0 +; GFX908-NEXT: v_mov_b32_e32 v2, s2 ; GFX908-NEXT: ds_add_f32 v2, v1 ; GFX908-NEXT: .LBB29_4: ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] @@ -8234,20 +8233,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX908-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX908-NEXT: v_mov_b32_e32 v1, s10 -; GFX908-NEXT: s_mov_b64 s[0:1], exec +; GFX908-NEXT: s_mov_b64 s[2:3], exec ; GFX908-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX908-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX908-NEXT: ; implicit-def: $vgpr0 ; GFX908-NEXT: .LBB29_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX908-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX908-NEXT: v_readfirstlane_b32 s8, v1 ; GFX908-NEXT: v_readlane_b32 s9, v2, s5 ; GFX908-NEXT: s_mov_b32 m0, s5 -; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX908-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX908-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd @@ -8255,15 +8254,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX908-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX908-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX908-NEXT: s_cbranch_execz .LBB29_8 ; GFX908-NEXT: ; %bb.7: ; GFX908-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX908-NEXT: .LBB29_8: -; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s2, v2 ; GFX908-NEXT: v_add_f32_e32 v0, s2, v0 @@ -8275,10 +8274,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: local_ds_fadd_one_as: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s5, s5, 4 @@ -8287,9 +8286,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_cbranch_execz .LBB29_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_lshl_b32 s8, s5, 3 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -8300,15 +8299,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_readfirstlane_b32 s10, v1 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB29_4 ; 
GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX8-NEXT: s_lshl_b32 s0, s5, 4 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX8-NEXT: s_lshl_b32 s2, s5, 4 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: ds_add_f32 v2, v1 ; GFX8-NEXT: .LBB29_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] @@ -8316,20 +8315,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: .LBB29_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_readfirstlane_b32 s8, v1 ; GFX8-NEXT: v_readlane_b32 s9, v2, s5 ; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd @@ -8337,16 +8336,16 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr2 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB29_8 ; GFX8-NEXT: ; %bb.7: ; 
GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX8-NEXT: .LBB29_8: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: v_add_f32_e32 v0, s2, v0 @@ -8359,10 +8358,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX7-LABEL: local_ds_fadd_one_as: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 -; GFX7-NEXT: s_mov_b64 s[0:1], exec -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s5, s5, 4 @@ -8374,8 +8373,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_lshl_b32 s8, s5, 3 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: ds_read_b32 v1, v2 -; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_2: ; %atomicrmw.start @@ -8385,8 +8384,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 -; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, v4 +; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: 
s_cbranch_execnz .LBB29_2 ; GFX7-NEXT: ; %bb.3: ; %Flow18 @@ -8397,15 +8396,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_cbranch_execz .LBB29_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s0, s5, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshl_b32 s2, s5, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: ds_read_b32 v3, v1 -; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_6: ; %atomicrmw.start2 @@ -8414,8 +8413,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 ; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 -; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_6 @@ -8428,7 +8427,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_add_f32_e32 v2, s10, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: .LBB29_8: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8437,12 +8436,12 @@ define amdgpu_kernel void 
@local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX7-NEXT: s_cbranch_execnz .LBB29_8 ; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8451,10 +8450,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX6-LABEL: local_ds_fadd_one_as: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 -; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s5, s5, 4 @@ -8466,8 +8465,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_lshl_b32 s8, s5, 3 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ds_read_b32 v1, v2 -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_2: ; %atomicrmw.start @@ -8477,8 +8476,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 ; GFX6-NEXT: 
ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 -; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, v4 +; GFX6-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_2 ; GFX6-NEXT: ; %bb.3: ; %Flow16 @@ -8489,15 +8488,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_cbranch_execz .LBB29_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s0, s5, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshl_b32 s2, s5, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: ds_read_b32 v3, v1 -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_6: ; %atomicrmw.start2 @@ -8506,8 +8505,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 -; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_6 @@ -8520,7 +8519,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_add_f32_e32 v2, s10, v2 ; GFX6-NEXT: v_mov_b32_e32 
v3, s10 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: s_mov_b64 s[0:1], 0 +; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: .LBB29_8: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8529,12 +8528,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX6-NEXT: s_cbranch_execnz .LBB29_8 ; GFX6-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll index d068e2ae4ec97f..8386a685a1a120 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @local_memory(ptr addrspace(1) %out) #0 { ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 16, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_barrier ; GCN-NEXT: ds_read_b32 v0, v0 @@ -51,7 +51,7 @@ define amdgpu_kernel void @local_memory_two_objects(ptr addrspace(1) %out) #0 { ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4 ; SI-NEXT: v_sub_i32_e32 v0, vcc, 12, v1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt 
lgkmcnt(0) ; SI-NEXT: s_barrier ; SI-NEXT: v_sub_i32_e32 v2, vcc, 28, v1 @@ -73,7 +73,7 @@ define amdgpu_kernel void @local_memory_two_objects(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_barrier ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:3 offset1:7 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 49531e3b4f8f30..52f97150e4b301 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -19,20 +19,20 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-LABEL: local_stack_offset_uses_sp: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_add_u32_e32 v0, 64, v1 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0x2000 -; MUBUF-NEXT: s_mov_b32 s4, 0 +; MUBUF-NEXT: s_mov_b32 s6, 0 ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: .LBB0_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 -; MUBUF-NEXT: v_add_u32_e32 v3, s4, v1 -; MUBUF-NEXT: s_add_i32 s4, s4, 1 -; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 +; MUBUF-NEXT: v_add_u32_e32 v3, s6, v1 +; MUBUF-NEXT: s_add_i32 s6, s6, 1 +; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120 ; MUBUF-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB0_1 @@ -47,7 +47,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: 
buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; MUBUF-NEXT: v_mov_b32_e32 v6, 0 ; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 ; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v5, vcc @@ -58,30 +58,30 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; ; FLATSCR-LABEL: local_stack_offset_uses_sp: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 -; FLATSCR-NEXT: scratch_store_dword off, v0, s0 +; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 +; FLATSCR-NEXT: scratch_store_dword off, v0, s2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_mov_b32 s0, 0 +; FLATSCR-NEXT: s_mov_b32 s2, 0 ; FLATSCR-NEXT: .LBB0_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLATSCR-NEXT: s_add_i32 s1, s0, 0x3000 -; FLATSCR-NEXT: s_add_i32 s0, s0, 1 -; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 -; FLATSCR-NEXT: scratch_store_byte off, v0, s1 +; FLATSCR-NEXT: s_add_i32 s3, s2, 0x3000 +; FLATSCR-NEXT: s_add_i32 s2, s2, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v0, s3 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 -; FLATSCR-NEXT: s_addk_i32 s0, 0x3000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:208 glc +; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 +; FLATSCR-NEXT: s_addk_i32 s2, 0x3000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s2 offset:208 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0x3000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:64 glc +; 
FLATSCR-NEXT: s_movk_i32 s2, 0x3000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s2 offset:64 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: v_mov_b32_e32 v4, 0 ; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -201,19 +201,19 @@ entry: define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out) { ; MUBUF-LABEL: local_stack_offset_uses_sp_flat: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0x4000 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x2000 -; MUBUF-NEXT: s_mov_b32 s4, 0 +; MUBUF-NEXT: s_mov_b32 s6, 0 ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: .LBB2_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 -; MUBUF-NEXT: v_add_u32_e32 v2, s4, v0 -; MUBUF-NEXT: s_add_i32 s4, s4, 1 -; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 +; MUBUF-NEXT: v_add_u32_e32 v2, s6, v0 +; MUBUF-NEXT: s_add_i32 s6, s6, 1 +; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120 ; MUBUF-NEXT: buffer_store_byte v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1 @@ -251,7 +251,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; MUBUF-NEXT: v_mov_b32_e32 v12, 0x4000 ; MUBUF-NEXT: buffer_load_dword v3, v10, s[0:3], 0 offen offset:12 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; MUBUF-NEXT: buffer_load_dword v10, v11, s[0:3], 0 offen offset:16 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 @@ -272,33 +272,33 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; ; 
FLATSCR-LABEL: local_stack_offset_uses_sp_flat: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:1024 +; FLATSCR-NEXT: s_mov_b32 s2, 0 +; FLATSCR-NEXT: scratch_store_dword off, v0, s2 offset:1024 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: .LBB2_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLATSCR-NEXT: s_add_i32 s1, s0, 0x2000 -; FLATSCR-NEXT: s_add_i32 s0, s0, 1 -; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 -; FLATSCR-NEXT: scratch_store_byte off, v0, s1 +; FLATSCR-NEXT: s_add_i32 s3, s2, 0x2000 +; FLATSCR-NEXT: s_add_i32 s2, s2, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v0, s3 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s0, 0x1000 -; FLATSCR-NEXT: s_addk_i32 s0, 0x2000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc +; FLATSCR-NEXT: s_movk_i32 s2, 0x1000 +; FLATSCR-NEXT: s_addk_i32 s2, 0x2000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s2 offset:720 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc +; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 offset:704 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:16 glc +; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s2 offset:16 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s0 glc +; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s2 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; 
FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: v_mov_b32_e32 v12, 0 ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc diff --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll index 7814eb603e5541..cc90d03e667157 100644 --- a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll +++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll @@ -14,9 +14,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_max_short_forward_branch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART @@ -26,10 +26,10 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_sleep 0 ; GCN-NEXT: .LBB0_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -55,9 +55,9 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc0 
.LBB1_1 ; GCN-NEXT: ; %bb.3: ; %bb0 ; GCN-NEXT: s_getpc_b64 s[8:9] @@ -73,10 +73,10 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -102,9 +102,9 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0 +; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s2, 0 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] ; GCN-NEXT: s_cbranch_vccz .LBB2_1 ; GCN-NEXT: ; %bb.3: ; %bb0 @@ -122,10 +122,10 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -150,7 +150,7 @@ bb3: define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: min_long_forward_vbranch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ 
-254,28 +254,28 @@ bb3: define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { ; GCN-LABEL: uniform_unconditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, 0 -; GCN-NEXT: s_mov_b64 s[0:1], -1 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc0 .LBB5_1 ; GCN-NEXT: ; %bb.7: ; %bb0 -; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: .Lpost_getpc5: -; GCN-NEXT: s_add_u32 s8, s8, (.LBB5_4-.Lpost_getpc5)&4294967295 -; GCN-NEXT: s_addc_u32 s9, s9, (.LBB5_4-.Lpost_getpc5)>>32 -; GCN-NEXT: s_setpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_4-.Lpost_getpc5)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_4-.Lpost_getpc5)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] ; GCN-NEXT: .LBB5_1: ; %Flow -; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN-NEXT: s_cbranch_vccnz .LBB5_3 ; GCN-NEXT: .LBB5_2: ; %bb2 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 17 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB5_3: ; %bb4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt expcnt(0) @@ -294,17 +294,17 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: s_mov_b64 vcc, exec ; GCN-NEXT: s_cbranch_execnz .LBB5_5 ; GCN-NEXT: ; %bb.9: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: .Lpost_getpc6: -; GCN-NEXT: s_add_u32 s8, s8, (.LBB5_2-.Lpost_getpc6)&4294967295 -; GCN-NEXT: 
s_addc_u32 s9, s9, (.LBB5_2-.Lpost_getpc6)>>32 -; GCN-NEXT: s_setpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_2-.Lpost_getpc6)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_2-.Lpost_getpc6)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] ; GCN-NEXT: .LBB5_5: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: .Lpost_getpc4: -; GCN-NEXT: s_add_u32 s8, s8, (.LBB5_3-.Lpost_getpc4)&4294967295 -; GCN-NEXT: s_addc_u32 s9, s9, (.LBB5_3-.Lpost_getpc4)>>32 -; GCN-NEXT: s_setpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_3-.Lpost_getpc4)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_3-.Lpost_getpc4)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 390d1d70ff2aae..f19eeee1ca7411 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -4,12 +4,12 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_flat: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GCN-NEXT: .LBB0_2: ; %for.body @@ -50,12 +50,12 @@ for.end: ; preds = %for.body, %entry define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_global: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB1_3 ; GCN-NEXT: ; 
%bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 @@ -96,12 +96,12 @@ for.end: ; preds = %for.body, %entry define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_constant: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB2_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: .LBB2_2: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -143,7 +143,7 @@ for.end: ; preds = %for.body, %entry define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_local: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll index 5484ba1ed2fe08..df3b2135e72ac1 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -32,7 +32,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 { ; ; GCN-LABEL: break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: undef_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword 
s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -207,7 +207,7 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: constexpr_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -297,7 +297,7 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: true_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -386,7 +386,7 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: false_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -479,7 +479,7 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: invert_true_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll index 7998d430d5f907..cb3ea2e812770c 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -24,7 +24,7 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce ; GCN-NEXT: ds_write_b8 v0, v1 ; GCN-NEXT: ds_read_u8 v2, v0 offset:2 ; GCN-NEXT: ds_read_u16 v3, v0 -; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b8 v0, v2 offset:6 ; GCN-NEXT: ds_write_b16 v0, v3 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll index 00dcff093c7db2..c6a734a065ff15 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll @@ -26,18 +26,18 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x2(ptr addrspace(1) %arg, i ; ; GCN-LABEL: no_clobber_ds_load_stores_x2: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 2 ; GCN-NEXT: ds_write_b32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_write_b32 v1, v2 offset:256 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:256 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: global_store_dword v1, v0, s[0:1] @@ -74,21 +74,21 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i ; ; GCN-LABEL: no_clobber_ds_load_stores_x3: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 2 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: ds_write_b32 v1, v2 offset:256 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s0, s0, 2 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 ; GCN-NEXT: v_mov_b32_e32 v2, 3 ; GCN-NEXT: ds_write_b32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_write_b32 v1, 
v2 offset:512 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v3, v0 offset:256 ; GCN-NEXT: ds_read_b32 v0, v0 offset:512 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index 9bbcc6988e311f..00d01a080ad14a 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -161,29 +161,24 @@ define amdgpu_kernel void @k01() { ; GCN-LABEL: k01: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, f0@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, f0@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, f1@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, f1@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 
0x0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm call void @f0() call void @f1() @@ -200,36 +195,28 @@ define amdgpu_kernel void @k23() { ; GCN-LABEL: k23: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s15, 1 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; 
GCN-NEXT: s_endpgm - - call void @f2() call void @f3() ret void @@ -250,35 +237,30 @@ define amdgpu_kernel void @k123() { ; GCN-LABEL: k123: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, f1@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, f1@gotpcrel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 -; GCN-NEXT: s_mov_b32 s15, 0 +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v1, v0 offset:16 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, f2@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, f2@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GCN-NEXT: ds_write_b8 v0, v1 offset:16 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_mov_b32 s15, 0 +; GCN-NEXT: 
s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm call void @f1() %ld = load i8, ptr addrspace(3) @v3 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll index 72a0aceaae12b6..d3cc60c501fd79 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -226,37 +226,29 @@ define amdgpu_kernel void @k01() { ; GCN-LABEL: k01: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s15, 0 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; 
GCN-NEXT: s_endpgm - - call void @f0() call void @f1() ret void @@ -273,36 +265,28 @@ define amdgpu_kernel void @k23() { ; GCN-LABEL: k23: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s15, 2 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm - - call void @f2() call void @f3() ret void @@ -323,41 +307,33 @@ define amdgpu_kernel void @k123() { ; GCN-LABEL: k123: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 
s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s15, 1 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v1, v0 offset:2 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GCN-NEXT: ds_write_b8 v0, v1 offset:2 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_endpgm - +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @f1() %ld = load i8, ptr addrspace(3) @v3 %mul = mul i8 %ld, 8 diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index fef1b57db5685d..1429251fc64211 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -1,30 +1,19 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel 
void @workgroup_ids_kernel() { -; GFX9-SDAG-LABEL: workgroup_ids_kernel: -; GFX9-SDAG: ; %bb.0: ; %.entry -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: workgroup_ids_kernel: -; GFX9-GISEL: ; %bb.0: ; %.entry -; GFX9-GISEL-NEXT: s_mov_b32 s0, s6 -; GFX9-GISEL-NEXT: s_mov_b32 s1, s7 -; GFX9-GISEL-NEXT: s_mov_b32 s2, s8 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: workgroup_ids_kernel: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX9ARCH-SDAG-LABEL: workgroup_ids_kernel: ; GFX9ARCH-SDAG: ; %bb.0: ; %.entry @@ -83,27 +72,20 @@ define amdgpu_kernel void @workgroup_ids_kernel() { define amdgpu_kernel void @caller() { ; GFX9-SDAG-LABEL: caller: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-SDAG-NEXT: s_mov_b32 s38, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-SDAG-NEXT: s_add_u32 s36, s36, s9 -; GFX9-SDAG-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-SDAG-NEXT: s_add_u32 s8, s2, 36 -; GFX9-SDAG-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-SDAG-NEXT: s_getpc_b64 s[2:3] -; GFX9-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 -; GFX9-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 -; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0 -; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-SDAG-NEXT: s_add_u32 flat_scratch_lo, s10, s13 +; GFX9-SDAG-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s13 +; GFX9-SDAG-NEXT: 
s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-SDAG-NEXT: s_getpc_b64 s[8:9] +; GFX9-SDAG-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 +; GFX9-SDAG-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-SDAG-NEXT: s_mov_b32 s12, s6 -; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[14:15] @@ -111,27 +93,20 @@ define amdgpu_kernel void @caller() { ; ; GFX9-GISEL-LABEL: caller: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-GISEL-NEXT: s_mov_b32 s38, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-GISEL-NEXT: s_add_u32 s36, s36, s9 -; GFX9-GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-GISEL-NEXT: s_add_u32 s8, s2, 36 -; GFX9-GISEL-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-GISEL-NEXT: s_getpc_b64 s[0:1] -; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 -; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0 +; GFX9-GISEL-NEXT: s_add_u32 flat_scratch_lo, s10, s13 +; GFX9-GISEL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s13 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-GISEL-NEXT: s_getpc_b64 s[8:9] +; GFX9-GISEL-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 +; GFX9-GISEL-NEXT: s_addc_u32 
s9, s9, callee@gotpcrel32@hi+12 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-GISEL-NEXT: s_mov_b32 s12, s6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[14:15] @@ -139,81 +114,61 @@ define amdgpu_kernel void @caller() { ; ; GFX9ARCH-SDAG-LABEL: caller: ; GFX9ARCH-SDAG: ; %bb.0: -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s38, -1 -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9ARCH-SDAG-NEXT: s_add_u32 s36, s36, s6 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s37, s37, 0 -; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s2, 36 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s3, 0 -; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[2:3] -; GFX9ARCH-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 -; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-SDAG-NEXT: s_add_u32 flat_scratch_lo, s10, s12 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s0, s0, s12 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[8:9] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; 
GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX9ARCH-SDAG-NEXT: s_endpgm ; ; GFX9ARCH-GISEL-LABEL: caller: ; GFX9ARCH-GISEL: ; %bb.0: -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s38, -1 -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9ARCH-GISEL-NEXT: s_add_u32 s36, s36, s6 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s2, 36 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s3, 0 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] -; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 -; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: s_add_u32 flat_scratch_lo, s10, s12 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, s12 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[8:9] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 ; 
GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX9ARCH-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: caller: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 -; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX12-SDAG-NEXT: s_mov_b32 s7, callee@abs32@hi -; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo -; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: caller: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 -; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo -; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi -; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: caller: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_add_co_u32 s4, s4, callee@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s5, s5, callee@gotpcrel32@hi+16 +; GFX12-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-NEXT: s_mov_b32 s32, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() call void 
@callee(i32 %idx) #0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll index 2963e7b765a0d1..7830bfc6ac7f59 100644 --- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll +++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @zext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: zext_shl64_to_32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -24,8 +24,8 @@ define amdgpu_kernel void @zext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 define amdgpu_kernel void @sext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: sext_shl64_to_32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -45,8 +45,8 @@ define amdgpu_kernel void @sext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 define amdgpu_kernel void @zext_shl64_overflow(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: zext_shl64_overflow: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -67,8 +67,8 @@ define amdgpu_kernel void @zext_shl64_overflow(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @sext_shl64_overflow(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: sext_shl64_overflow: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -89,7 +89,7 @@ define amdgpu_kernel void @sext_shl64_overflow(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: mulu24_shl64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_and_b32_e32 v0, 6, v0 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -112,7 +112,7 @@ bb: define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture readonly %arg1) { ; GCN-LABEL: muli24_shl64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index e8ac1b2887c36e..994ef22539a65f 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -18,7 +18,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; VI-LABEL: s_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s2, 0xffff ; VI-NEXT: s_lshr_b32 s2, s2, 16 @@ -35,7 +35,7 @@ define amdgpu_kernel 
void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; CI-LABEL: s_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -54,7 +54,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; GFX10-LABEL: s_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, s3, s2 @@ -63,7 +63,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; GFX11-LABEL: s_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_lshrrev_b16 v1, s3, s2 @@ -79,7 +79,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -90,7 +90,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -109,7 +109,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: v_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -131,7 +131,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -142,9 +142,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -169,20 +167,20 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: lshr_v_s_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_s_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 
; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -203,22 +201,22 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; CI-LABEL: lshr_v_s_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s0, s8, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, s1, v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, s0, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -226,10 +224,9 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: lshr_v_s_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -239,12 +236,9 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; 
GFX11-LABEL: lshr_v_s_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -266,20 +260,20 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: lshr_s_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s0 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_s_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -300,22 +294,22 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; CI-LABEL: lshr_s_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 
0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_and_b32 s0, s0, 0xffff -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s0, s8, 16 +; CI-NEXT: s_and_b32 s1, s8, 0xffff +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshr_b32_e32 v3, s1, v3 -; CI-NEXT: v_lshr_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshr_b32_e32 v3, s0, v3 +; CI-NEXT: v_lshr_b32_e32 v2, s1, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -323,10 +317,9 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: lshr_s_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -336,12 +329,9 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: lshr_s_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, 
v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -363,7 +353,7 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -374,7 +364,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_imm_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -394,7 +384,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -414,7 +404,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -425,9 +415,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_imm_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -450,7 +438,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -461,7 +449,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_v_imm_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -480,7 +468,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -497,7 +485,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -508,9 +496,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_v_imm_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -533,7 +519,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -545,7 +531,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -567,7 +553,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: v_lshr_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -596,7 +582,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -608,9 +594,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_lshr_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
global_load_b128 v[0:3], v4, s[2:3] @@ -636,7 +620,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -648,7 +632,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_v_imm_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -670,7 +654,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -689,7 +673,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -701,9 +685,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_v_imm_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll index 3032b1028dc2d2..995c8c8679397c 100644 --- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @mad_u16( ; GFX8-LABEL: mad_u16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -35,7 +35,7 @@ define amdgpu_kernel void @mad_u16( ; ; GFX9-LABEL: mad_u16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -51,7 +51,7 @@ define amdgpu_kernel void @mad_u16( ; ; GFX10-LABEL: mad_u16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -67,10 +67,8 @@ define amdgpu_kernel void @mad_u16( ; ; GFX11-LABEL: mad_u16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll index e876a8d9dda692..620566d3baff38 100644 --- 
a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll +++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll @@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 ; GCN-LABEL: {{^}}get_global_id_0: ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff -; GCN: s_mul_i32 [[MUL:s[0-9]+]], s10, [[WGSIZEX]] +; GCN: s_mul_i32 [[MUL:s[0-9]+]], s8, [[WGSIZEX]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0 define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 { %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 8eb0a46cc8b17f..400298bcff4f97 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -908,8 +908,8 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 { define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; CI-LABEL: mad_i64_i32_uniform: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -924,33 +924,33 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; ; SI-LABEL: mad_i64_i32_uniform: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mul_hi_u32 v1, s6, v0 -; SI-NEXT: s_mul_i32 s2, s6, s7 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 
-; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mul_i32 s4, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX9-LABEL: mad_i64_i32_uniform: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s3, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s2, s6, s7 -; GFX9-NEXT: s_add_u32 s0, s3, s0 -; GFX9-NEXT: s_addc_u32 s1, s2, s1 +; GFX9-NEXT: s_mul_i32 s0, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s1, s6, s7 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm @@ -958,8 +958,8 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; GFX11-LABEL: mad_i64_i32_uniform: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s2, s6, s7 ; GFX11-NEXT: s_mul_hi_u32 s3, s6, s7 @@ -975,8 +975,8 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; GFX12-LABEL: mad_i64_i32_uniform: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_mov_b32 s3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index b8b4d4440d5809..9ec37a5e14cdf9 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -15,18 +15,18 @@ declare float @llvm.fabs.f32(float) nounwind readnone define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_madak_f32 v2, v2, v3, 0x41200000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -34,8 +34,8 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX8-LABEL: madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -56,12 +56,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX9-LABEL: madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -70,13 +70,13 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX10-MAD-LABEL: madak_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -85,10 +85,8 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX11-MAD-LABEL: madak_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -105,13 +103,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX940-FMA-LABEL: madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -120,13 +117,13 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX10-FMA-LABEL: madak_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -135,10 +132,8 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; 
GFX11-FMA-LABEL: madak_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -170,7 +165,7 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { ; GFX6-LABEL: madak_2_use_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -195,7 +190,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX8-LABEL: madak_2_use_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -225,7 +220,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX9-LABEL: madak_2_use_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -245,7 +240,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-MAD-LABEL: madak_2_use_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 
2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -264,9 +259,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX11-MAD-LABEL: madak_2_use_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -276,9 +269,9 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; GFX11-MAD-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v2, v1, v2 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-MAD-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_add_f32 v2, 0x41200000, v2 -; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 +; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: v_dual_add_f32 v1, 0x41200000, v1 :: v_dual_add_f32 v2, 0x41200000, v2 ; GFX11-MAD-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[2:3] offset:4 dlc @@ -289,8 +282,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX940-FMA-LABEL: madak_2_use_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) @@ -310,7 +302,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; 
GFX10-FMA-LABEL: madak_2_use_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -329,9 +321,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX11-FMA-LABEL: madak_2_use_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -375,7 +365,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a) #0 { ; GFX6-LABEL: madak_m_inline_imm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -391,7 +381,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX8-LABEL: madak_m_inline_imm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -408,7 +398,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX9-LABEL: madak_m_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -419,7 +409,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX10-MAD-LABEL: madak_m_inline_imm_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] @@ -430,14 +420,13 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX11-MAD-LABEL: madak_m_inline_imm_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-MAD-NEXT: s_nop 0 @@ -446,8 +435,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX940-FMA-LABEL: madak_m_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] @@ -458,7 +446,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX10-FMA-LABEL: madak_m_inline_imm_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] @@ -469,9 +457,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX11-FMA-LABEL: madak_m_inline_imm_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] @@ -498,18 +484,18 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: madak_inline_imm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, v2, v3, 4.0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], 
s[4:7], 0 addr64 @@ -517,8 +503,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX8-LABEL: madak_inline_imm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -539,12 +525,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX9-LABEL: madak_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, v1, v2, 4.0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -553,13 +539,13 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX10-MAD-LABEL: madak_inline_imm_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, v1, v2, 4.0 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ 
-568,10 +554,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX11-MAD-LABEL: madak_inline_imm_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -588,13 +572,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX940-FMA-LABEL: madak_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -603,13 +586,13 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX10-FMA-LABEL: madak_inline_imm_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: 
global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -618,10 +601,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX11-FMA-LABEL: madak_inline_imm_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -651,26 +632,26 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, float %b) #0 { ; GFX6-LABEL: s_v_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mac_f32_e32 v3, s0, v2 +; GFX6-NEXT: v_mac_f32_e32 
v3, s8, v2 ; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_v_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -688,23 +669,22 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; ; GFX9-LABEL: s_v_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mac_f32_e32 v2, s0, v1 +; GFX9-NEXT: v_mac_f32_e32 v2, s2, v1 ; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: s_v_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-MAD-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) @@ -715,15 +695,14 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; GFX11-MAD-LABEL: s_v_madak_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | 
instid1(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-MAD-NEXT: s_nop 0 @@ -732,24 +711,22 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; ; GFX940-FMA-LABEL: s_v_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s0, v1 +; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s2, v1 ; GFX940-FMA-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: s_v_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FMA-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) @@ -760,10 +737,8 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; GFX11-FMA-LABEL: s_v_madak_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 
-; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] @@ -788,84 +763,82 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: v_s_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mac_f32_e32 v3, s0, v2 +; GFX6-NEXT: v_mac_f32_e32 v3, s2, v2 ; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: v_s_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: 
s_load_dword s2, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mac_f32_e32 v2, s2, v3 +; GFX8-NEXT: v_mac_f32_e32 v2, s0, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mac_f32_e32 v2, s4, v1 -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: v_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-MAD-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-MAD-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, s4, v1, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: v_s_madak_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] @@ -875,47 +848,44 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a ; ; GFX940-FMA-LABEL: v_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[0:1] -; GFX940-FMA-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX940-FMA-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-FMA-NEXT: 
v_fmac_f32_e32 v2, s4, v1 -; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: v_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-FMA-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-FMA-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: v_s_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 +; GFX11-FMA-NEXT: v_fmaak_f32 v1, s2, v1, 0x41200000 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) @@ -935,7 +905,7 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-LABEL: s_s_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x41200000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -949,7 +919,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX8-LABEL: s_s_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -961,7 +931,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX9-LABEL: s_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -972,7 +942,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX10-MAD-LABEL: s_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s3 @@ -982,7 +952,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX11-MAD-LABEL: s_s_madak_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v0, s2, s3 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -994,7 +964,7 @@ define 
amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX940-FMA-LABEL: s_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,7 +975,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX10-FMA-LABEL: s_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s3 @@ -1015,7 +985,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX11-FMA-LABEL: s_s_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1033,19 +1003,19 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: no_madak_src0_modifier_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 
v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 ; GFX6-NEXT: s_mov_b32 s0, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, |v2|, v3, s0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -1053,8 +1023,8 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX8-LABEL: no_madak_src0_modifier_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1076,13 +1046,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX9-LABEL: no_madak_src0_modifier_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, 0x41200000 +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, |v1|, v2, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -1091,13 +1061,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX10-MAD-LABEL: no_madak_src0_modifier_f32: ; 
GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, |v1|, v2, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -1106,10 +1076,8 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX11-MAD-LABEL: no_madak_src0_modifier_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -1126,14 +1094,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX940-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] -; 
GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, |v1|, v2, s0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -1142,13 +1109,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX10-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -1157,10 +1124,8 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX11-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1191,19 +1156,19 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: 
no_madak_src1_modifier_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 ; GFX6-NEXT: s_mov_b32 s0, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, v2, |v3|, s0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -1211,8 +1176,8 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX8-LABEL: no_madak_src1_modifier_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1234,13 +1199,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX9-LABEL: no_madak_src1_modifier_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, 0x41200000 +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, v1, |v2|, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -1249,13 +1214,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX10-MAD-LABEL: no_madak_src1_modifier_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, v1, |v2|, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -1264,10 +1229,8 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX11-MAD-LABEL: no_madak_src1_modifier_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -1284,14 +1247,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX940-FMA-LABEL: 
no_madak_src1_modifier_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] -; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, v1, |v2|, s0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -1300,13 +1262,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX10-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -1315,10 +1277,8 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX11-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; 
GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1352,36 +1312,36 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 { ; GFX6-LABEL: madak_constant_bus_violation: ; GFX6: ; %bb.0: ; %bb -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX6-NEXT: ; %bb.1: ; %bb3 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: .LBB9_2: ; %bb4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x12 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x12 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mac_f32_e64 v1, s0, 0.5 ; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: madak_constant_bus_violation: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x24 ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cmp_lg_u32 s0, 0 +; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX8-NEXT: ; %bb.1: ; %bb3 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -1390,7 +1350,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX8-NEXT: .LBB9_2: ; %bb4 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mac_f32_e64 v1, s0, 0.5 @@ -1401,9 +1361,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX9-LABEL: madak_constant_bus_violation: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %bb3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -1412,7 +1372,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX9-NEXT: .LBB9_2: ; %bb4 ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mac_f32_e64 v1, s0, 0.5 @@ -1423,9 +1383,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX10-MAD-LABEL: madak_constant_bus_violation: ; GFX10-MAD: ; %bb.0: ; %bb -; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-MAD-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-MAD-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX10-MAD-NEXT: ; %bb.1: ; %bb3 ; GFX10-MAD-NEXT: v_mov_b32_e32 v0, 0 @@ 
-1434,7 +1394,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX10-MAD-NEXT: .LBB9_2: ; %bb4 ; GFX10-MAD-NEXT: global_load_dword v0, v[0:1], off glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX10-MAD-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, s0, v1, 0x42280000 @@ -1445,9 +1405,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX11-MAD-LABEL: madak_constant_bus_violation: ; GFX11-MAD: ; %bb.0: ; %bb -; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x24 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-MAD-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-MAD-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX11-MAD-NEXT: ; %bb.1: ; %bb3 ; GFX11-MAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1456,7 +1416,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX11-MAD-NEXT: .LBB9_2: ; %bb4 ; GFX11-MAD-NEXT: global_load_b32 v0, v[0:1], off glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x48 +; GFX11-MAD-NEXT: s_load_b32 s0, s[0:1], 0x48 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v1, s0, 0.5 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1470,9 +1430,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX940-FMA-LABEL: madak_constant_bus_violation: ; GFX940-FMA: ; %bb.0: ; %bb -; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: s_cmp_lg_u32 s0, 0 +; GFX940-FMA-NEXT: s_cmp_lg_u32 s2, 0 ; GFX940-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX940-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1481,7 +1441,7 
@@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX940-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX940-FMA-NEXT: global_load_dword v0, v[0:1], off sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX940-FMA-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: v_fmac_f32_e64 v1, s0, 0.5 @@ -1492,9 +1452,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX10-FMA-LABEL: madak_constant_bus_violation: ; GFX10-FMA: ; %bb.0: ; %bb -; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-FMA-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX10-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX10-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1503,7 +1463,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX10-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX10-FMA-NEXT: global_load_dword v0, v[0:1], off glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX10-FMA-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, s0, v1, 0x42280000 @@ -1514,9 +1474,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX11-FMA-LABEL: madak_constant_bus_violation: ; GFX11-FMA: ; %bb.0: ; %bb -; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x24 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-FMA-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX11-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1525,7 +1485,7 @@ define amdgpu_kernel void 
@madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX11-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX11-FMA-NEXT: global_load_b32 v0, v[0:1], off glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x48 +; GFX11-FMA-NEXT: s_load_b32 s0, s[0:1], 0x48 ; GFX11-FMA-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll index 92536c2078514a..c7a831185b83c6 100644 --- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX9-LABEL: test: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x1c -; GFX9-NEXT: s_load_dword s5, s[6:7], 0x38 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x1c +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x38 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_mul_i32 s10, s10, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s10 -; GFX9-NEXT: v_add_u32_e32 v0, s5, v0 +; GFX9-NEXT: s_and_b32 s4, s7, 0xffff +; GFX9-NEXT: s_mul_i32 s6, s6, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s6 +; GFX9-NEXT: v_add_u32_e32 v0, s8, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -34,13 +34,13 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX10-LABEL: test: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x1c -; GFX10-NEXT: s_load_dword s5, s[6:7], 0x38 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s7, s[4:5], 0x1c +; 
GFX10-NEXT: s_load_dword s8, s[4:5], 0x38 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s4, s4, 0xffff -; GFX10-NEXT: s_mul_i32 s10, s10, s4 -; GFX10-NEXT: v_add3_u32 v0, s5, s10, v0 +; GFX10-NEXT: s_and_b32 s4, s7, 0xffff +; GFX10-NEXT: s_mul_i32 s6, s6, s4 +; GFX10-NEXT: v_add3_u32 v0, s8, s6, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 @@ -59,16 +59,14 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX11-LABEL: test: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x1c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x38 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x1c +; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x38 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s13, s13, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_add3_u32 v0, s5, s13, v0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s15, s15, s4 +; GFX11-NEXT: v_add3_u32 v0, s5, s15, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll index e929da796de6de..2b5d32fa7b9776 100644 --- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll +++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; GFX10-LABEL: long_store_chain: ; 
GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_mov_b32 s1, s0 @@ -91,7 +91,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; ; GFX11-LABEL: long_store_chain: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 @@ -176,7 +176,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; ; GFX12-LABEL: long_store_chain: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s1, s0 @@ -397,7 +397,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) { ; GFX10-LABEL: long_load_chain: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3e ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -670,7 +670,7 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) { ; ; GFX11-LABEL: long_load_chain: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -944,7 +944,7 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) { ; ; GFX12-LABEL: long_load_chain: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: s_clause 0x1f ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll index a8139cc6bc4c95..8ef2ca2765e8a1 100644 --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -28,12 +28,12 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imax_sge_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -54,8 +54,8 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -78,12 +78,12 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imax_sge_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -104,8 +104,8 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -139,19 +139,19 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imax_sge_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[0:1] +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4 ; GFX9-NEXT: 
s_nop 0 ; GFX9-NEXT: global_load_dword v4, v0, s[6:7] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_short_d16 v1, v0, s[0:1] offset:4 +; GFX9-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_pk_max_i16 v3, v4, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -175,8 +175,8 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -202,12 +202,12 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imax_sge_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX9-NEXT: v_pk_max_i16 v0, v0, v2 @@ -229,8 +229,8 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sgt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -251,12 +251,12 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imax_sgt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -277,8 +277,8 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_umax_uge_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -299,12 +299,12 @@ define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umax_uge_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -325,8 +325,8 @@ define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_umax_ugt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -347,12 +347,12 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umax_ugt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -372,8 +372,8 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_umax_ugt_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -396,12 +396,12 @@ define amdgpu_kernel 
void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umax_ugt_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index 4fb90bbc46a8f5..bef9ff82aa396c 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -5,23 +5,23 @@ define amdgpu_kernel void @v_test_imax_sge_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_load_dword s6, s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_i32_e32 v0, s2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: v_max_i32_e32 v0, s6, v0 +; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sge_i32: @@ -58,26 +58,26 @@ define amdgpu_kernel void @v_test_imax_sge_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_imax_sge_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sge_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_i32_e32 v3, s7, v3 -; SI-NEXT: v_max_i32_e32 v2, s6, v2 -; SI-NEXT: v_max_i32_e32 v1, s5, v1 -; SI-NEXT: v_max_i32_e32 v0, s4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: v_max_i32_e32 v3, s11, v3 +; SI-NEXT: v_max_i32_e32 v2, s10, v2 +; SI-NEXT: v_max_i32_e32 v1, s9, v1 +; SI-NEXT: v_max_i32_e32 v0, s8, v0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sge_v4i32: @@ -116,7 +116,7 @@ define amdgpu_kernel void @v_test_imax_sge_v4i32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_imax_sge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -146,8 +146,8 @@ define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @s_test_imax_sge_imm_i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_test_imax_sge_imm_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -175,24 +175,24 @@ define amdgpu_kernel void @s_test_imax_sge_imm_i32(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sge_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_sbyte v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_i32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, 
s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sge_i8: @@ -240,8 +240,8 @@ define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @s_test_imax_sgt_imm_i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_test_imax_sgt_imm_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -269,7 +269,7 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_i32(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: s_test_imax_sgt_imm_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -303,23 +303,23 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_imax_sgt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sgt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 +; 
SI-NEXT: s_load_dword s6, s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_i32_e32 v0, s2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: v_max_i32_e32 v0, s6, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sgt_i32: @@ -355,7 +355,7 @@ define amdgpu_kernel void @v_test_imax_sgt_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_imax_sgt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -385,23 +385,23 @@ define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @v_test_umax_uge_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_umax_uge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_load_dword s6, s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_u32_e32 v0, s2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 
+; SI-NEXT: v_max_u32_e32 v0, s6, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_uge_i32: @@ -437,7 +437,7 @@ define amdgpu_kernel void @v_test_umax_uge_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_umax_uge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -467,20 +467,20 @@ define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @s_test_umax_uge_v3i32(ptr addrspace(1) %out, <3 x i32> %a, <3 x i32> %b) nounwind { ; SI-LABEL: s_test_umax_uge_v3i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_max_u32 s2, s6, s10 -; SI-NEXT: s_max_u32 s0, s5, s9 -; SI-NEXT: s_max_u32 s1, s4, s8 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8 +; SI-NEXT: s_max_u32 s6, s6, s10 +; SI-NEXT: s_max_u32 s5, s5, s9 +; SI-NEXT: s_max_u32 s4, s4, s8 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: s_test_umax_uge_v3i32: @@ -507,24 +507,24 @@ define amdgpu_kernel void @s_test_umax_uge_v3i32(ptr addrspace(1) 
%out, <3 x i32 define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_umax_uge_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_u32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_uge_i8: @@ -565,20 +565,20 @@ define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_umax_ugt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 
addr64 -; SI-NEXT: s_load_dword s0, s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dword s4, s[4:5], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_u32_e32 v0, s0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: v_max_u32_e32 v0, s4, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_ugt_i32: @@ -614,7 +614,7 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_umax_ugt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -644,7 +644,7 @@ define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: s_test_umax_ugt_imm_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -680,9 +680,9 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) nounwind { ; SI-LABEL: simplify_demanded_bits_test_umax_ugt_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s5, s[0:1], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 
0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -727,9 +727,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(ptr addrspac define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace(1) %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) nounwind { ; SI-LABEL: simplify_demanded_bits_test_max_slt_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s5, s[0:1], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -773,9 +773,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace define amdgpu_kernel void @s_test_imax_sge_i16(ptr addrspace(1) %out, [8 x i32], i16 %a, [8 x i32], i16 %b) nounwind { ; SI-LABEL: s_test_imax_sge_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s5, s[0:1], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -826,8 +826,8 @@ define amdgpu_kernel void @s_test_imax_sge_i16(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_umax_ugt_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -868,8 +868,8 @@ define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64 define 
amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_umax_uge_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -910,8 +910,8 @@ define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64 define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_imax_sgt_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -952,8 +952,8 @@ define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64 define amdgpu_kernel void @test_imax_sge_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_imax_sge_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 0a76e169e9c385..ae1f31272a15f0 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 { ; CHECK-LABEL: memcpy_p0_p0_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; 
CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 @@ -121,7 +121,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #0 { ; CHECK-LABEL: memcpy_p1_p1_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32 @@ -145,7 +145,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p1_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] @@ -181,12 +181,12 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p5_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_add_u32 s8, s8, s7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 @@ -206,52 +206,52 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 -; CHECK-NEXT: 
s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:14 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:13 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:11 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:10 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:9 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:8 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 ; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:7 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, 
s[16:19], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:6 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:5 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:2 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:1 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:31 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:30 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:4 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:3 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 @@ -262,229 +262,229 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 ; 
CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:23 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:22 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:21 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:20 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:19 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:29 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:18 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:17 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v14, v1, 
s[8:11], 0 offen offset:16 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:27 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:26 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:25 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:24 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:44 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:43 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:45 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:36 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen 
offset:35 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:35 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:47 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:34 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:28 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:42 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:33 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:32 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:61 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:40 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: 
buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:39 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:38 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:37 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:57 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:56 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:58 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:49 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:48 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:46 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; 
CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:60 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:41 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:55 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:74 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:53 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:52 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:51 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:63 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 +; CHECK-NEXT: 
buffer_store_byte v6, v1, s[8:11], 0 offen offset:50 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:77 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:71 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:70 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:69 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:59 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:73 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:54 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:68 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:66 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] 
offset:111 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:65 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:64 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:62 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:76 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:90 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:72 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:87 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:67 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 ; CHECK-NEXT: s_nop 
0 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:79 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:95 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:93 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:75 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:89 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:78 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:94 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:92 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, 
s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:88 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:91 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:86 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:85 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:84 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:83 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:82 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 @@ -492,13 +492,13 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 +; CHECK-NEXT: 
buffer_store_byte v20, v1, s[8:11], 0 offen offset:81 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:80 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:111 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:108 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:100 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 @@ -506,54 +506,54 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:107 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:105 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:102 ; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:101 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 ; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:104 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 ; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:99 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:98 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:97 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:127 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:126 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 
offen offset:124 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:123 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:122 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:121 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:120 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:118 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:117 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:116 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:115 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:114 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:113 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -563,32 +563,32 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 { ; CHECK-LABEL: memcpy_p0_p5_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] -; 
CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 -; CHECK-NEXT: s_add_u32 s16, s16, s13 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-NEXT: s_add_u32 s8, s8, s7 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:15 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:14 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:13 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen 
offset:11 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:10 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:9 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:8 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:7 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:6 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:5 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:3 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:2 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:1 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:31 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -601,287 +601,287 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:23 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:22 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:21 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen 
offset:20 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:20 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:19 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:18 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:17 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:47 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:16 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:27 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:26 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:25 +; CHECK-NEXT: buffer_load_ubyte 
v8, v2, s[8:11], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:45 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:37 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:36 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:35 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:34 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:33 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:32 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:29 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:44 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:63 ; CHECK-NEXT: s_nop 0 ; 
CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:42 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:40 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:39 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:38 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:41 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:59 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:51 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:50 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:49 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], 
v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:48 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:46 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:61 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:43 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:58 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:79 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:56 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:54 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:53 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 +; 
CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:52 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:55 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:73 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:65 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:64 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:62 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:77 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:75 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:57 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 
offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:72 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:95 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:70 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:68 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:67 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:66 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:69 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:87 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:111 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: 
buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:110 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:91 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:74 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:89 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:71 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:86 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:84 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:83 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 +; CHECK-NEXT: buffer_load_ubyte v7, 
v2, s[8:11], 0 offen offset:81 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:80 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:78 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:93 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:82 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:101 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:90 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:105 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:88 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:103 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: 
buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:85 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:100 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:92 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:107 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:104 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:102 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:99 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:94 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:109 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen 
offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:108 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:96 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:97 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:98 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:120 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:121 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:122 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:123 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:124 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 ; CHECK-NEXT: 
flat_store_byte v[0:1], v15 offset:105 @@ -891,20 +891,20 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:126 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:116 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:117 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:118 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:119 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:127 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:114 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:115 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:125 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:113 +; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:112 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: 
flat_store_byte v[0:1], v6 offset:98 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 @@ -935,7 +935,7 @@ entry: define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p3_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] @@ -971,7 +971,7 @@ entry: define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-LABEL: memcpy_p0_p3_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 ; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 @@ -1254,7 +1254,7 @@ entry: define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK-LABEL: memcpy_p0_p0_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 @@ -1367,7 +1367,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #1 { ; CHECK-LABEL: memcpy_p1_p1_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32 @@ -1391,7 +1391,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p1_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] @@ -1427,12 +1427,12 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p5_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_add_u32 s8, s8, s7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 @@ -1452,52 +1452,52 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:14 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:13 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen 
offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:11 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:10 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:9 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:8 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 ; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:7 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:6 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:5 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:2 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:1 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: 
buffer_store_byte v17, v1, s[16:19], 0 offen +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:31 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:30 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:4 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:3 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 @@ -1508,229 +1508,229 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:23 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:22 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:21 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_byte v10, v1, 
s[8:11], 0 offen offset:20 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:19 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:29 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:18 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:17 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:16 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:27 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:26 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:25 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte 
v6, v1, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:24 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:44 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:43 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:45 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:36 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:35 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:47 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:34 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:28 ; 
CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:42 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:33 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:32 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:61 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:40 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:39 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:38 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:37 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:57 ; CHECK-NEXT: global_load_ubyte v18, 
v0, s[0:1] offset:70 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:56 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:58 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:49 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:48 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:46 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:60 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:41 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:55 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:74 ; CHECK-NEXT: 
global_load_ubyte v14, v0, s[0:1] offset:87 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:53 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:52 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:51 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:63 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:50 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:77 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:71 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:70 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen 
offset:69 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:59 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:73 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:54 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:68 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:66 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:65 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:64 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:62 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_store_byte v12, v1, 
s[8:11], 0 offen offset:76 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:90 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:72 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:87 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:67 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:79 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:95 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:93 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, 
s[16:19], 0 offen offset:75 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:75 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:89 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:78 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:94 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:92 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:88 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:91 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:86 +; CHECK-NEXT: 
buffer_store_byte v15, v1, s[8:11], 0 offen offset:85 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:84 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:83 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:82 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 @@ -1738,13 +1738,13 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:81 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:80 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:111 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:108 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:100 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 @@ -1752,54 +1752,54 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 ; CHECK-NEXT: global_load_ubyte v6, v0, 
s[0:1] offset:126 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:107 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:105 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:102 ; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:101 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 ; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:104 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 ; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, 
s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:99 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:98 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:97 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:127 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:126 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:124 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:123 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:122 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:121 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:120 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:118 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 
offen offset:117 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:116 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:115 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:114 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:113 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -1809,32 +1809,32 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 { ; CHECK-LABEL: memcpy_p0_p5_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 -; CHECK-NEXT: s_add_u32 s16, s16, s13 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-NEXT: s_add_u32 s8, s8, s7 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, 
s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:15 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:14 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:13 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:11 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:10 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:9 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:8 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:7 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:6 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:5 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:3 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:2 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:1 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen +; CHECK-NEXT: buffer_load_ubyte v19, v2, 
s[8:11], 0 offen offset:31 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -1847,287 +1847,287 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:23 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:22 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:21 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:20 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:19 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:18 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:17 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 
offen offset:47 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:47 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:16 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:27 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:26 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:25 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:45 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:37 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:36 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:35 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: 
buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:34 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:33 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:32 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:29 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:44 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:63 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:42 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:40 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:39 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 +; CHECK-NEXT: buffer_load_ubyte v8, v2, 
s[8:11], 0 offen offset:38 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:41 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:59 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:51 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:50 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:49 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:48 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:46 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:61 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:43 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: 
buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:58 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:79 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:56 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:54 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:53 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:52 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:55 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:73 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:65 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 
0 offen offset:64 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:64 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:62 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:77 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:75 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:57 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:72 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:95 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:70 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:68 ; 
CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:67 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:66 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:69 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:87 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:111 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:110 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:91 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:74 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; 
CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:89 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:71 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:86 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:84 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:83 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:81 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:80 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:78 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:93 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:82 ; CHECK-NEXT: 
s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:101 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:90 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:105 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:88 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:103 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:85 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:100 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:92 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:107 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v16 
offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:104 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:102 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:99 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:94 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:109 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:108 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:96 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 +; 
CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:97 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:98 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:120 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:121 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:122 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:123 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:124 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 @@ -2137,20 +2137,20 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, 
s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:126 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:116 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:117 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:118 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:119 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:127 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:114 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:115 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:125 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:113 +; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:112 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 @@ -2181,7 +2181,7 @@ entry: define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p3_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] @@ -2217,7 +2217,7 @@ entry: define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-LABEL: memcpy_p0_p3_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 ; CHECK-NEXT: ds_read_u8 v4, 
v2 offset:113 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll b/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll index 3a6d8ca1e35f60..f60728c16a3ae5 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll @@ -9,7 +9,7 @@ define void @memcpy_p1_p4_sz16_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p4_sz16_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 @@ -26,7 +26,7 @@ define void @memcpy_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p4_sz31_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s8 @@ -34,7 +34,7 @@ define void @memcpy_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-NEXT: v_mov_b32_e32 v4, s10 ; CHECK-NEXT: v_mov_b32_e32 v5, s11 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_load_dwordx4 v[2:5], v6, s[6:7] offset:15 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v6, s[4:5] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -47,7 +47,7 @@ define void @memcpy_p1_p4_sz32_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p4_sz32_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: 
v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 diff --git a/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll index b32bfd0e495ba1..1b8483a54bb3bf 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll @@ -9,7 +9,7 @@ define void @memmove_p1_p4_sz16_align_4_4(ptr addrspace(1) align 4 %dst, ptr add ; CHECK-LABEL: memmove_p1_p4_sz16_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 @@ -27,8 +27,8 @@ define void @memmove_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: global_load_ubyte v9, v2, s[6:7] offset:30 -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; CHECK-NEXT: global_load_ubyte v9, v2, s[4:5] offset:30 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 @@ -53,7 +53,7 @@ define void @memmove_p1_p4_sz32_align_4_4(ptr addrspace(1) align 4 %dst, ptr add ; CHECK-LABEL: memmove_p1_p4_sz32_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s8 ; CHECK-NEXT: v_mov_b32_e32 v3, s9 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 3a065d518f0a9d..c49e0501665c57 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void 
@vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: vector_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] @@ -24,7 +24,7 @@ define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture read ; ; GCN-SCRATCH-LABEL: vector_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_clause 0x3 @@ -69,7 +69,7 @@ bb: define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: scalar_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -98,7 +98,7 @@ define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture read ; ; GCN-SCRATCH-LABEL: scalar_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -250,11 +250,11 @@ bb: define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture readnone %arg1, ptr addrspace(1) noalias nocapture %arg2) { ; GCN-LABEL: vector_clause_indirect: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx2 v[8:9], v0, s[0:1] +; GCN-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off ; GCN-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16 @@ -267,20 +267,20 @@ define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocap ; ; GCN-SCRATCH-LABEL: vector_clause_indirect: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[0:1] +; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3] ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GCN-SCRATCH-NEXT: s_clause 0x1 ; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[4:5], off ; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v[4:5], off offset:16 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GCN-SCRATCH-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -384,10 +384,10 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s18, -1 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 
0x24 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 ; GCN-NEXT: s_mov_b32 s19, 0xe00000 -; GCN-NEXT: s_add_u32 s16, s16, s9 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GCN-NEXT: s_add_u32 s16, s16, s3 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 @@ -411,13 +411,13 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; ; GCN-SCRATCH-LABEL: flat_scratch_load: ; GCN-SCRATCH: ; %bb.0: ; %.entry -; GCN-SCRATCH-NEXT: s_add_u32 s6, s6, s11 -; GCN-SCRATCH-NEXT: s_addc_u32 s7, s7, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5 +; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GCN-SCRATCH-NEXT: s_clause 0x1 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x24 -; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x44 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1 ; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8 @@ -453,22 +453,22 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32> %desc) { ; GCN-LABEL: flat_scratch_load_clause: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: s_mov_b32 s15, 0xe00000 -; GCN-NEXT: s_add_u32 s12, s12, s9 -; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s6, 
-1 +; GCN-NEXT: s_mov_b32 s7, 0xe00000 +; GCN-NEXT: s_add_u32 s4, s4, s3 +; GCN-NEXT: s_addc_u32 s5, s5, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 -; GCN-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 0x40d00000 -; GCN-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GCN-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: exp mrt0 v0, off, off, off done vm @@ -476,10 +476,10 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32 ; ; GCN-SCRATCH-LABEL: flat_scratch_load_clause: ; GCN-SCRATCH: ; %bb.0: ; %.entry -; GCN-SCRATCH-NEXT: s_add_u32 s6, s6, s11 -; GCN-SCRATCH-NEXT: s_addc_u32 s7, s7, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5 +; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40d00000 ; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off diff --git a/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll index 9c2b437a08f088..7bb09f6697b685 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll @@ -18,7 +18,7 @@ declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, 
i32 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16: ; GCN: v_mfma_f32_32x32x2bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %a = bitcast i32 1 to <2 x i16> @@ -30,7 +30,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16: ; GCN: v_mfma_f32_16x16x2bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -40,7 +40,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16: ; GCN: v_mfma_f32_4x4x2bf16 v[{{[0-9:]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -50,7 +50,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16: ; GCN: v_mfma_f32_32x32x4bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -60,7 +60,7 @@ bb: ; GCN-LABEL: 
{{^}}test_mfma_f32_16x16x8bf16: ; GCN: v_mfma_f32_16x16x8bf16 v[{{[0-9:]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -70,7 +70,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16_1k: ; GCN: v_mfma_f32_32x32x4bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -80,7 +80,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4bf16_1k: ; GCN: v_mfma_f32_16x16x4bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -90,7 +90,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x4bf16_1k: ; GCN: v_mfma_f32_4x4x4bf16_1k v[{{[0-9:]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg 
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -100,7 +100,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8bf16_1k: ; GCN: v_mfma_f32_32x32x8bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -110,7 +110,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x16bf16_1k: ; GCN: v_mfma_f32_16x16x16bf16_1k v[{{[0-9:]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -120,7 +120,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f64_4x4x4f64: ; GCN: v_mfma_f64_4x4x4f64 v[{{[0-9:]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+:[0-9]+}} -define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg) { bb: %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double 1.0, double 1.0, double 128.0, i32 0, i32 0, i32 0) store double %mai.1, ptr addrspace(1) %arg @@ -129,7 +129,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64: ; GCN: v_mfma_f64_16x16x4f64 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr 
addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x double>, ptr addrspace(1) %arg %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double 1.0, double 1.0, <4 x double> %in.1, i32 0, i32 0, i32 0) @@ -139,7 +139,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8: ; GCN: v_mfma_i32_32x32x8i8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 1, <16 x i32> %in.1, i32 0, i32 0, i32 0) @@ -149,12 +149,10 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8: ; GCN: v_mfma_i32_16x16x16i8 v[{{[0-9:]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 1, <4 x i32> %in.1, i32 0, i32 0, i32 0) store <4 x i32> %mai.1, ptr addrspace(1) %arg ret void } - -attributes #0 = { "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll index e0708a55f438bd..ba34c1bbe1d710 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll @@ -19,7 +19,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_agpr: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #2 { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #1 { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = 
tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -29,7 +29,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) #0 { bb: %acc = call i32 asm sideeffect "; def $0", "={a0}"() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -40,7 +40,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) #0 { bb: call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> undef) %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -63,7 +63,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #1 { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #0 { bb: call void @foo() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -78,7 +78,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call_multi_bb: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg, i1 %c0) #1 { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg, i1 %c0) #0 { bb1: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> 
@llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) @@ -106,6 +106,5 @@ bb: declare void @foo() -attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" "amdgpu-no-agpr" } -attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" } -attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" } +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" } +attributes #1 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll index b48152dad99ac3..59b13c02f92fb9 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll @@ -30,7 +30,7 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i3 ; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8: ; GCN: v_mfma_i32_16x16x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 0, i32 0, i32 0) @@ -40,7 +40,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_32x32x16i8: ; GCN: v_mfma_i32_32x32x16_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 4294967298, i64 12884901892, <16 x i32> %in.1, i32 0, i32 0, i32 0) @@ -50,7 +50,7 @@ bb: ; 
GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32: ; GCN: v_mfma_f32_16x16x8_xf32 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> , <2 x float> , <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -60,7 +60,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32: ; GCN: v_mfma_f32_32x32x4_xf32 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> , <2 x float> , <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -70,7 +70,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_bf8: ; GCN: v_mfma_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -80,7 +80,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_fp8: ; GCN: v_mfma_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, 
ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -90,7 +90,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_bf8: ; GCN: v_mfma_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -100,7 +100,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_fp8: ; GCN: v_mfma_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -110,7 +110,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_bf8: ; GCN: v_mfma_f32_32x32x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -120,7 +120,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_fp8: ; GCN: v_mfma_f32_32x32x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], 
v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -130,7 +130,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_bf8: ; GCN: v_mfma_f32_32x32x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -140,7 +140,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_fp8: ; GCN: v_mfma_f32_32x32x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -150,7 +150,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_f16: ; GCN: v_smfmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) { bb: %in.1 = load 
<4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -160,7 +160,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_f16: ; GCN: v_smfmac_f32_32x32x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -170,7 +170,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_bf16: ; GCN: v_smfmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -180,7 +180,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_bf16: ; GCN: v_smfmac_f32_32x32x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %in.1, i32 %idx, 
i32 0, i32 0) @@ -190,7 +190,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_i8: ; GCN: v_smfmac_i32_16x16x64_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %in.1, i32 %idx, i32 0, i32 0) @@ -200,7 +200,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_i8: ; GCN: v_smfmac_i32_32x32x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %in.1, i32 %idx, i32 0, i32 0) @@ -210,7 +210,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_bf8: ; GCN: v_smfmac_f32_16x16x64_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -220,7 +220,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_fp8: ; GCN: v_smfmac_f32_16x16x64_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], 
v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -230,7 +230,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_bf8: ; GCN: v_smfmac_f32_16x16x64_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -240,7 +240,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_fp8: ; GCN: v_smfmac_f32_16x16x64_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -250,7 +250,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_bf8: ; GCN: v_smfmac_f32_32x32x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, 
i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -260,7 +260,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_fp8: ; GCN: v_smfmac_f32_32x32x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -270,7 +270,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_bf8: ; GCN: v_smfmac_f32_32x32x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -280,12 +280,10 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_fp8: ; GCN: v_smfmac_f32_32x32x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, 
<4 x i32> %b, i32 %idx) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) store <16 x float> %mai.1, ptr addrspace(1) %arg ret void } - -attributes #0 = { "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll index bffd15872c42cb..06775f5d3f92b2 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll @@ -19,7 +19,7 @@ declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: ; GCN: v_mfma_f32_32x32x1{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -29,7 +29,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: ; GCN: v_mfma_f32_16x16x1{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -39,7 +39,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: ; GCN: v_mfma_f32_4x4x1{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) { 
bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -49,7 +49,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32: ; GCN: v_mfma_f32_32x32x2{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -59,7 +59,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32: ; GCN: v_mfma_f32_16x16x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -69,7 +69,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16: ; GCN: v_mfma_f32_32x32x4{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg) { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> undef, <4 x half> undef, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -79,7 +79,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16: ; GCN: v_mfma_f32_16x16x4{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg) #0 { +define 
amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> undef, <4 x half> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -89,7 +89,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16: ; GCN: v_mfma_f32_4x4x4{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> undef, <4 x half> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -99,7 +99,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16: ; GCN: v_mfma_f32_32x32x8{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> undef, <4 x half> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -109,7 +109,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16: ; GCN: v_mfma_f32_16x16x16{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> undef, <4 x half> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -119,7 +119,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8: ; GCN: v_mfma_i32_32x32x4{{.*}} v[{{[0-9]+:[0-9]+}}], 
v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) { bb: %in.1 = load <32 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 1, <32 x i32> %in.1, i32 0, i32 0, i32 0) @@ -129,7 +129,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8: ; GCN: v_mfma_i32_16x16x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 1, <16 x i32> %in.1, i32 0, i32 0, i32 0) @@ -139,12 +139,10 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8: ; GCN: v_mfma_i32_4x4x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 1, <4 x i32> %in.1, i32 0, i32 0, i32 0) store <4 x i32> %mai.1, ptr addrspace(1) %arg ret void } - -attributes #0 = { "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index a77892c8f5fc7b..9dafa27ece86f6 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -31,8 +31,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_sle_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: 
v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -53,8 +53,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_sle_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -75,12 +75,12 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_sle_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -89,13 +89,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_sle_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -104,10 +104,8 @@ define 
amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_sle_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -145,7 +143,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_imin_sle_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -156,7 +154,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_imin_sle_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -167,7 +165,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_imin_sle_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -177,7 +175,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_imin_sle_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -187,7 +185,7 @@ define amdgpu_kernel void 
@s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_sle_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -217,7 +215,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; CI-LABEL: s_test_imin_sle_v1i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -228,7 +226,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; VI-LABEL: s_test_imin_sle_v1i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -239,7 +237,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX9-LABEL: s_test_imin_sle_v1i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -249,7 +247,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX10-LABEL: s_test_imin_sle_v1i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -259,7 +257,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_imin_sle_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 
0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -292,8 +290,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; CI-LABEL: s_test_imin_sle_v4i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s11, s15 ; CI-NEXT: s_min_i32 s3, s10, s14 @@ -310,8 +308,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; VI-LABEL: s_test_imin_sle_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s11, s15 ; VI-NEXT: s_min_i32 s3, s10, s14 @@ -328,8 +326,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; GFX9-LABEL: s_test_imin_sle_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s11, s15 @@ -346,8 +344,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; GFX10-LABEL: s_test_imin_sle_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s11, s15 @@ -364,8 +362,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) 
%out, <4 x i32 ; GFX11-LABEL: s_test_imin_sle_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s7, s11 @@ -419,9 +417,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; CI-LABEL: s_test_imin_sle_i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0xa +; CI-NEXT: s_load_dword s3, s[4:5], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: s_sext_i32_i8 s3, s3 @@ -434,9 +432,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; VI-LABEL: s_test_imin_sle_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x28 +; VI-NEXT: s_load_dword s3, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 @@ -449,9 +447,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; GFX9-LABEL: s_test_imin_sle_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 @@ -464,9 +462,9 @@ define amdgpu_kernel void 
@s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX10-LABEL: s_test_imin_sle_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i8 s2, s2 @@ -479,13 +477,13 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX11-LABEL: s_test_imin_sle_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i8 s2, s4 -; GFX11-NEXT: s_sext_i32_i8 s3, s5 +; GFX11-NEXT: s_sext_i32_i8 s2, s2 +; GFX11-NEXT: s_sext_i32_i8 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -556,9 +554,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; CI-LABEL: s_test_imin_sle_v4i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 +; CI-NEXT: s_load_dword s2, s[4:5], 0xa +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s3, s[4:5], 0x13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 24 ; CI-NEXT: s_sext_i32_i8 s5, s2 @@ -589,9 +587,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; VI-LABEL: s_test_imin_sle_v4i8: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x28 +; VI-NEXT: s_load_dword s3, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s2 ; VI-NEXT: v_lshrrev_b16_e64 v1, 8, s3 @@ -618,9 +616,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX9-LABEL: s_test_imin_sle_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 @@ -646,9 +644,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX10-LABEL: s_test_imin_sle_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 @@ -675,27 +673,29 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX11-LABEL: s_test_imin_sle_v4i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x4c +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: 
s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: v_ashrrev_i16 v0, 8, s0 -; GFX11-NEXT: v_ashrrev_i16 v1, 8, s1 +; GFX11-NEXT: s_lshr_b32 s4, s2, 16 +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-NEXT: v_ashrrev_i16 v0, 8, s2 +; GFX11-NEXT: v_ashrrev_i16 v1, 8, s3 ; GFX11-NEXT: v_ashrrev_i16 v2, 8, s4 ; GFX11-NEXT: v_ashrrev_i16 v3, 8, s5 -; GFX11-NEXT: s_bfe_i32 s0, s0, 0x80000 -; GFX11-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX11-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80000 ; GFX11-NEXT: s_bfe_i32 s4, s4, 0x80000 ; GFX11-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX11-NEXT: v_min_i16 v4, s0, s1 +; GFX11-NEXT: v_min_i16 v4, s2, s3 ; GFX11-NEXT: v_min_i16 v5, s4, s5 ; GFX11-NEXT: v_min_i16 v2, v2, v3 ; GFX11-NEXT: v_min_i16 v0, v0, v1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v4 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -707,7 +707,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -752,7 +751,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; CI-LABEL: s_test_imin_sle_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 16 ; CI-NEXT: s_sext_i32_i16 s2, s2 @@ -771,7 +770,7 @@ define amdgpu_kernel void 
@s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; VI-LABEL: s_test_imin_sle_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 16 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -790,7 +789,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX9-LABEL: s_test_imin_sle_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -800,7 +799,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX10-LABEL: s_test_imin_sle_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, s2, s3 @@ -809,7 +808,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX11-LABEL: s_test_imin_sle_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_i16 v1, s2, s3 @@ -904,8 +903,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; CI-LABEL: s_test_imin_sle_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s6, s0, 16 ; CI-NEXT: s_ashr_i32 s7, s1, 16 @@ -934,8 +933,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; VI-LABEL: s_test_imin_sle_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s6, s1, 16 ; VI-NEXT: s_sext_i32_i16 s1, s1 @@ -964,34 +963,34 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; GFX9-LABEL: s_test_imin_sle_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_pk_min_i16 v1, s1, v0 ; GFX9-NEXT: v_pk_min_i16 v0, s0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_sle_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, s1, s3 ; GFX10-NEXT: v_pk_min_i16 v0, s0, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_sle_v4i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_i16 v1, s5, s7 @@ -1031,8 +1030,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; 
; CI-LABEL: v_test_imin_slt_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1053,8 +1052,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_slt_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1075,12 +1074,12 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_slt_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -1089,13 +1088,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_slt_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword 
v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -1104,10 +1103,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_slt_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1172,8 +1169,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_slt_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1194,8 +1191,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_slt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1216,12 +1213,12 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_slt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; 
GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] @@ -1230,13 +1227,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_slt_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-NEXT: global_load_ushort v2, v0, s[4:5] +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i16 v1, v1, v2 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] @@ -1245,10 +1242,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_slt_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1287,7 +1282,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_imin_slt_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: 
s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1298,7 +1293,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_imin_slt_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1309,7 +1304,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_imin_slt_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -1319,7 +1314,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_imin_slt_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -1329,7 +1324,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_slt_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -1360,8 +1355,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; CI-LABEL: s_test_imin_slt_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s1, s1, s3 ; CI-NEXT: s_min_i32 s0, s0, s2 @@ -1374,8 +1369,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) 
%out, <2 x i32 ; ; VI-LABEL: s_test_imin_slt_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s1, s1, s3 ; VI-NEXT: s_min_i32 s0, s0, s2 @@ -1388,36 +1383,36 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; GFX9-LABEL: s_test_imin_slt_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s1, s1, s3 ; GFX9-NEXT: s_min_i32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_slt_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s0, s0, s2 ; GFX10-NEXT: s_min_i32 s1, s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_slt_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s4, s6 @@ -1448,8 +1443,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; CI-LABEL: s_test_imin_slt_imm_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1460,8 +1455,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; VI-LABEL: s_test_imin_slt_imm_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1472,8 +1467,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX9-LABEL: s_test_imin_slt_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, 8 @@ -1484,8 +1479,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX10-LABEL: s_test_imin_slt_imm_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, 8 @@ -1496,11 +1491,11 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX11-LABEL: s_test_imin_slt_imm_i32: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s4, 8 +; GFX11-NEXT: s_min_i32 s2, s2, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1527,8 +1522,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; CI-LABEL: s_test_imin_sle_imm_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1539,8 +1534,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; VI-LABEL: s_test_imin_sle_imm_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1551,8 +1546,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX9-LABEL: s_test_imin_sle_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, 8 @@ -1563,8 +1558,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX10-LABEL: s_test_imin_sle_imm_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 
0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, 8 @@ -1575,11 +1570,11 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX11-LABEL: s_test_imin_sle_imm_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s4, 8 +; GFX11-NEXT: s_min_i32 s2, s2, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1617,8 +1612,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_umin_ule_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1639,8 +1634,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_umin_ule_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1661,12 +1656,12 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umin_ule_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -1675,13 +1670,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_umin_ule_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -1690,10 +1685,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_umin_ule_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1748,8 +1741,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_umin_ule_v3i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; 
CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1772,8 +1765,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_umin_ule_v3i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1796,12 +1789,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umin_ule_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3] -; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5] +; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v4 @@ -1812,13 +1805,13 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_umin_ule_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3] -; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5] +; 
GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v2, v2, v5 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v4 @@ -1829,10 +1822,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_umin_ule_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1911,8 +1902,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_umin_ule_v3i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1947,8 +1938,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_umin_ule_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1975,12 +1966,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umin_ule_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX9-NEXT: v_pk_min_u16 v0, v0, v2 @@ -1991,13 +1982,13 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_umin_ule_v3i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX10-NEXT: v_pk_min_u16 v0, v0, v2 @@ -2008,10 +1999,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_umin_ule_v3i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2053,7 +2042,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_umin_ule_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, 
s0 @@ -2064,7 +2053,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_umin_ule_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2075,7 +2064,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_umin_ule_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2085,7 +2074,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_umin_ule_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2095,7 +2084,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ule_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2136,8 +2125,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_umin_ult_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2158,8 +2147,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: 
v_test_umin_ult_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2180,12 +2169,12 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umin_ult_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -2194,13 +2183,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_umin_ult_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -2209,10 +2198,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_umin_ult_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], 
s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2268,8 +2255,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; CI-LABEL: v_test_umin_ult_i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 @@ -2289,8 +2276,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: v_test_umin_ult_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0 @@ -2310,11 +2297,11 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_test_umin_ult_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-NEXT: global_load_ubyte v2, v0, s[4:5] +; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] @@ -2323,12 +2310,12 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; 
GFX10-LABEL: v_test_umin_ult_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX10-NEXT: global_load_ubyte v2, v0, s[4:5] +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u16 v1, v1, v2 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1] @@ -2337,9 +2324,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; GFX11-LABEL: v_test_umin_ult_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_u8 v1, v0, s[6:7] @@ -2377,7 +2363,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_umin_ult_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2388,7 +2374,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_umin_ult_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2399,7 +2385,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_umin_ult_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2409,7 +2395,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_umin_ult_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2419,7 +2405,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ult_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2471,7 +2457,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; CI-LABEL: v_test_umin_ult_i32_multi_use: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2492,7 +2478,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; VI-LABEL: v_test_umin_ult_i32_multi_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2513,7 +2499,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i32_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0 @@ 
-2531,7 +2517,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX10-LABEL: v_test_umin_ult_i32_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0 @@ -2549,7 +2535,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX11-LABEL: v_test_umin_ult_i32_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 @@ -2621,7 +2607,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; CI-LABEL: v_test_umin_ult_i16_multi_use: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 @@ -2643,7 +2629,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; VI-LABEL: v_test_umin_ult_i16_multi_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2665,7 +2651,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i16_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] @@ -2680,7 +2666,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX10-LABEL: v_test_umin_ult_i16_multi_use: ; GFX10: ; 
%bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -2696,7 +2682,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX11-LABEL: v_test_umin_ult_i16_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2735,7 +2721,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; CI-LABEL: s_test_umin_ult_v1i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2746,7 +2732,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; VI-LABEL: s_test_umin_ult_v1i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2757,7 +2743,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX9-LABEL: s_test_umin_ult_v1i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2767,7 +2753,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX10-LABEL: s_test_umin_ult_v1i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2777,7 +2763,7 @@ 
define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_umin_ult_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2818,8 +2804,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; CI-LABEL: s_test_umin_ult_v8i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s4, s11, s19 ; CI-NEXT: s_min_u32 s5, s10, s18 @@ -2849,8 +2835,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; VI-LABEL: s_test_umin_ult_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s4, s11, s19 ; VI-NEXT: s_min_u32 s5, s10, s18 @@ -2880,8 +2866,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; GFX9-LABEL: s_test_umin_ult_v8i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s4, s9, s17 @@ -2908,8 +2894,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; GFX10-LABEL: s_test_umin_ult_v8i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: 
s_load_dwordx16 s[8:23], s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s4, s9, s17 @@ -2935,8 +2921,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; GFX11-LABEL: s_test_umin_ult_v8i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x20 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x20 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s7, s15 @@ -3109,8 +3095,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; CI-LABEL: s_test_umin_ult_v8i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s2, s8, 16 ; CI-NEXT: s_and_b32 s3, s8, 0xffff @@ -3155,8 +3141,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; VI-LABEL: s_test_umin_ult_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s11, 16 ; VI-NEXT: s_lshr_b32 s4, s10, 16 @@ -3201,8 +3187,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; GFX9-LABEL: s_test_umin_ult_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v0, s15 @@ -3219,8 +3205,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; GFX10-LABEL: s_test_umin_ult_v8i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v3, s11, s15 @@ -3233,8 +3219,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; GFX11-LABEL: s_test_umin_ult_v8i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_u16 v3, s7, s11 @@ -3277,9 +3263,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; CI-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0xa +; CI-NEXT: s_load_dword s3, s[4:5], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3292,9 +3278,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; VI-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x28 +; VI-NEXT: s_load_dword s3, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3307,9 +3293,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; GFX9-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -3322,9 +3308,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; GFX10-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff @@ -3337,13 +3323,13 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; GFX11-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0xffff -; GFX11-NEXT: s_and_b32 s3, s5, 0xffff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
| instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_u32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -3386,9 +3372,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; CI-LABEL: simplify_demanded_bits_test_min_slt_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0xa +; CI-NEXT: s_load_dword s3, s[4:5], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s2, s2 ; CI-NEXT: s_sext_i32_i16 s3, s3 @@ -3401,9 +3387,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; VI-LABEL: simplify_demanded_bits_test_min_slt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x28 +; VI-NEXT: s_load_dword s3, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 @@ -3416,9 +3402,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; GFX9-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 @@ -3431,9 +3417,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; GFX10-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: 
s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s2 @@ -3446,13 +3432,13 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; GFX11-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 -; GFX11-NEXT: s_sext_i32_i16 s3, s5 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 +; GFX11-NEXT: s_sext_i32_i16 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -3503,8 +3489,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; CI-LABEL: s_test_imin_sle_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s3, s2 ; CI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3517,8 +3503,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; VI-LABEL: s_test_imin_sle_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s3, s2 ; 
VI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3531,8 +3517,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; GFX9-LABEL: s_test_imin_sle_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s3, s2 @@ -3545,8 +3531,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX10-LABEL: s_test_imin_sle_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s3, s2 @@ -3559,14 +3545,14 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX11-LABEL: s_test_imin_sle_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 -; GFX11-NEXT: s_ashr_i32 s3, s4, 16 +; GFX11-NEXT: s_sext_i32_i16 s3, s2 +; GFX11-NEXT: s_ashr_i32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_i32 s2, s2, s3 +; GFX11-NEXT: s_min_i32 s2, s3, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3599,8 +3585,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_umin_ult_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; 
CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3617,8 +3603,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_umin_ult_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3635,16 +3621,16 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_umin_ult_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3653,14 +3639,14 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_umin_ult_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5] -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[6:7] +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3669,8 +3655,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_umin_ult_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[6:7], s[0:1] @@ -3709,8 +3695,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_umin_ule_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3727,8 +3713,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_umin_ule_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3745,16 +3731,16 @@ define amdgpu_kernel void 
@test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_umin_ule_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3763,14 +3749,14 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_umin_ule_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5] -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 +; GFX10-NEXT: v_cmp_le_u64_e64 s4, s[2:3], s[6:7] +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3779,8 +3765,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_umin_ule_i64: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e64 s2, s[6:7], s[0:1] @@ -3819,8 +3805,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_imin_slt_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3837,8 +3823,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_imin_slt_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3855,16 +3841,16 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_imin_slt_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: 
s_cselect_b32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3873,14 +3859,14 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_imin_slt_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5] -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7] +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3889,8 +3875,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_imin_slt_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[0:1] @@ -3929,8 +3915,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_imin_sle_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; 
CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3947,8 +3933,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_imin_sle_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3965,16 +3951,16 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_imin_sle_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3983,14 +3969,14 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_imin_sle_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5] -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: 
s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 +; GFX10-NEXT: v_cmp_le_i64_e64 s4, s[2:3], s[6:7] +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3999,8 +3985,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_imin_sle_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e64 s2, s[6:7], s[0:1] @@ -4062,8 +4048,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_imin_sle_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4093,8 +4079,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_imin_sle_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4117,12 +4103,12 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imin_sle_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 
0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -4131,13 +4117,13 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_imin_sle_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -4146,10 +4132,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_imin_sle_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -4214,8 +4198,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_imin_ule_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], 
s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4244,8 +4228,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_imin_ule_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4268,12 +4252,12 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imin_ule_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -4282,13 +4266,13 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_imin_ule_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -4297,10 +4281,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_imin_ule_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll index 27b71dd471a839..46036256780bad 100644 --- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll +++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll @@ -28,128 +28,96 @@ store i32 0, ptr addrspace(3) @used_by_kernel define amdgpu_kernel void @withcall() { ; GFX9-LABEL: withcall: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_mov_b32 s23, 0xe00000 -; GFX9-NEXT: s_add_u32 s20, s20, s9 -; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 36 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; 
GFX9-NEXT: s_mov_b64 s[2:3], s[22:23] +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s8, s0, 36 +; GFX9-NEXT: s_addc_u32 s9, s1, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[12:13] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[14:15] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: ds_write_b32 v3, v3 offset:8 +; GFX9-NEXT: ds_write_b32 v0, v0 offset:8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: withcall: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s22, -1 -; GFX10-NEXT: s_mov_b32 s23, 0x31c16000 -; GFX10-NEXT: s_add_u32 s20, s20, s9 -; GFX10-NEXT: s_addc_u32 s21, s21, 0 -; GFX10-NEXT: s_mov_b32 s14, s8 -; GFX10-NEXT: s_add_u32 s8, s2, 36 -; GFX10-NEXT: s_addc_u32 s9, s3, 0 -; GFX10-NEXT: s_getpc_b64 s[2:3] -; GFX10-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX10-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX10-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX10-NEXT: s_mov_b32 s12, s6 -; GFX10-NEXT: s_mov_b32 s13, s7 -; GFX10-NEXT: s_mov_b64 s[2:3], s[22:23] +; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: 
s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s14, -1 +; GFX10-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-NEXT: s_add_u32 s12, s12, s3 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_add_u32 s8, s0, 36 +; GFX10-NEXT: s_addc_u32 s9, s1, 0 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_mov_b64 s[0:1], s[12:13] +; GFX10-NEXT: s_mov_b64 s[2:3], s[14:15] ; GFX10-NEXT: s_mov_b32 s32, 0 -; GFX10-NEXT: ds_write_b32 v3, v3 offset:8 +; GFX10-NEXT: ds_write_b32 v0, v0 offset:8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: s_endpgm ; ; G_GFX9-LABEL: withcall: ; G_GFX9: ; %bb.0: -; G_GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; G_GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; G_GFX9-NEXT: s_mov_b32 s22, -1 -; G_GFX9-NEXT: s_mov_b32 s23, 0xe00000 -; G_GFX9-NEXT: s_add_u32 s20, s20, s9 -; G_GFX9-NEXT: s_addc_u32 s21, s21, 0 -; G_GFX9-NEXT: s_mov_b32 s14, s8 -; G_GFX9-NEXT: s_add_u32 s8, s2, 36 -; G_GFX9-NEXT: s_addc_u32 s9, s3, 0 -; G_GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] -; G_GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; G_GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; G_GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; G_GFX9-NEXT: s_mov_b32 s14, -1 +; G_GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; G_GFX9-NEXT: s_add_u32 s12, s12, s3 +; G_GFX9-NEXT: s_addc_u32 s13, s13, 0 +; G_GFX9-NEXT: s_add_u32 s8, s0, 36 +; G_GFX9-NEXT: s_addc_u32 s9, s1, 0 ; G_GFX9-NEXT: s_getpc_b64 s[0:1] ; G_GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 ; G_GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 -; G_GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; G_GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; G_GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; G_GFX9-NEXT: s_mov_b64 
s[0:1], s[20:21] -; G_GFX9-NEXT: v_mov_b32_e32 v3, 0 -; G_GFX9-NEXT: v_mov_b32_e32 v4, 8 -; G_GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; G_GFX9-NEXT: s_mov_b64 s[2:3], s[22:23] -; G_GFX9-NEXT: s_mov_b32 s12, s6 -; G_GFX9-NEXT: s_mov_b32 s13, s7 +; G_GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; G_GFX9-NEXT: s_mov_b64 s[0:1], s[12:13] +; G_GFX9-NEXT: v_mov_b32_e32 v0, 0 +; G_GFX9-NEXT: v_mov_b32_e32 v1, 8 +; G_GFX9-NEXT: s_mov_b64 s[2:3], s[14:15] ; G_GFX9-NEXT: s_mov_b32 s32, 0 -; G_GFX9-NEXT: ds_write_b32 v4, v3 +; G_GFX9-NEXT: ds_write_b32 v1, v0 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; G_GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; G_GFX9-NEXT: s_endpgm ; ; G_GFX10-LABEL: withcall: ; G_GFX10: ; %bb.0: -; G_GFX10-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; G_GFX10-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; G_GFX10-NEXT: s_mov_b32 s22, -1 -; G_GFX10-NEXT: s_mov_b32 s23, 0x31c16000 -; G_GFX10-NEXT: s_add_u32 s20, s20, s9 -; G_GFX10-NEXT: s_addc_u32 s21, s21, 0 -; G_GFX10-NEXT: s_mov_b32 s14, s8 -; G_GFX10-NEXT: s_add_u32 s8, s2, 36 -; G_GFX10-NEXT: s_addc_u32 s9, s3, 0 -; G_GFX10-NEXT: s_mov_b64 s[10:11], s[4:5] -; G_GFX10-NEXT: s_mov_b64 s[4:5], s[0:1] +; G_GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; G_GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; G_GFX10-NEXT: s_mov_b32 s14, -1 +; G_GFX10-NEXT: s_mov_b32 s15, 0x31c16000 +; G_GFX10-NEXT: s_add_u32 s12, s12, s3 +; G_GFX10-NEXT: s_addc_u32 s13, s13, 0 +; G_GFX10-NEXT: s_add_u32 s8, s0, 36 +; G_GFX10-NEXT: s_addc_u32 s9, s1, 0 ; G_GFX10-NEXT: s_getpc_b64 s[0:1] ; G_GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 ; G_GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 -; G_GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; G_GFX10-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; G_GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; G_GFX10-NEXT: v_mov_b32_e32 v3, 0 -; G_GFX10-NEXT: v_mov_b32_e32 v4, 8 -; G_GFX10-NEXT: s_mov_b64 s[0:1], s[20:21] -; G_GFX10-NEXT: 
s_mov_b64 s[2:3], s[22:23] -; G_GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 -; G_GFX10-NEXT: s_mov_b32 s12, s6 -; G_GFX10-NEXT: s_mov_b32 s13, s7 +; G_GFX10-NEXT: v_mov_b32_e32 v0, 0 +; G_GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, 8 +; G_GFX10-NEXT: s_mov_b64 s[0:1], s[12:13] +; G_GFX10-NEXT: s_mov_b64 s[2:3], s[14:15] ; G_GFX10-NEXT: s_mov_b32 s32, 0 -; G_GFX10-NEXT: ds_write_b32 v4, v3 +; G_GFX10-NEXT: ds_write_b32 v1, v0 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; G_GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; G_GFX10-NEXT: s_endpgm store i32 0, ptr addrspace(3) @used_by_both call void @nonkernel() diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll index 1c38f8ffc89edc..99120ab4a14249 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: add_reg_imm ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -27,9 +27,9 @@ define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) { define amdgpu_kernel void @add_reg_reg(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: 
add_reg_reg ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -50,9 +50,9 @@ define amdgpu_kernel void @add_reg_reg(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sub_reg_imm(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sub_reg_imm ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -73,9 +73,9 @@ define amdgpu_kernel void @sub_reg_imm(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sub_imm_reg(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sub_imm_reg ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; 
CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -96,9 +96,9 @@ define amdgpu_kernel void @sub_imm_reg(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sub_reg_reg(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sub_reg_reg ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll index eb638da3904055..4332d9daeaaf5e 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void 
@atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -23,13 +23,13 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 ; GCN-NEXT: buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400 -; GCN-NEXT: s_load_dword s2, s[2:3], 0xf +; GCN-NEXT: s_load_dword s2, s[0:1], 0xf ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -73,7 +73,7 @@ exit: define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32_noret: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -82,13 +82,13 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[4:7], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_3 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: buffer_load_dword v4, v[1:2], s[4:7], 0 addr64 offset:400 -; 
GCN-NEXT: s_load_dword s2, s[2:3], 0xf +; GCN-NEXT: s_load_dword s2, s[0:1], 0xf ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: .LBB1_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll index 90a3d350e7416e..63688ebeab9d0b 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -23,10 +23,10 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %atomic -; GCN-NEXT: s_load_dword s0, s[2:3], 0xf +; GCN-NEXT: s_load_dword s0, s[0:1], 0xf ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -58,7 +58,7 @@ exit: define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32_noret: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -67,10 +67,10 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: 
buffer_load_dwordx2 v[1:2], v[1:2], s[4:7], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %atomic -; GCN-NEXT: s_load_dword s0, s[2:3], 0xf +; GCN-NEXT: s_load_dword s0, s[0:1], 0xf ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll index ece7e28c763fb1..9d6e0927b0dfd6 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll @@ -8,7 +8,7 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 @@ -45,7 +45,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-LABEL: ctlz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -87,7 +87,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
global_load_ubyte v0, v1, s[2:3] offset:5 @@ -125,7 +125,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; ; GFX10-LABEL: ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -168,7 +168,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 @@ -205,7 +205,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-LABEL: cttz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -249,7 +249,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 @@ -287,7 +287,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; ; GFX10-LABEL: cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll index 4630b0d7ef50ba..1cd9afef13b5e2 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: exp_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -24,9 +24,9 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: exp_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: 
[[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -45,9 +45,9 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: log_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -65,9 +65,9 @@ define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: log_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile 
"amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -86,9 +86,9 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rcp_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -106,9 +106,9 @@ define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rcp_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -127,9 +127,9 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) { define 
amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rsq_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -147,9 +147,9 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rsq_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -168,9 +168,9 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sqrt_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: 
$sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -188,9 +188,9 @@ define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sqrt_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll index 4aed9dc2fca6ca..4ba5f3abcb24b1 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -5,49 +5,49 @@ ; Test addressing modes when the scratch base is not a frame index. 
; GCN-LABEL: {{^}}store_private_offset_i8: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_i8() #0 { store volatile i8 5, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_i16: -; GCN: buffer_store_short v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_i16() #0 { store volatile i16 5, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_i32: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_i32() #0 { store volatile i32 5, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_v2i32: -; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[12:15], 0 offset:8 +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_v2i32() #0 { store volatile <2 x i32> , ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_v4i32: -; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[12:15], 0 offset:8 +; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_v4i32() #0 { store volatile <4 x i32> , ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}load_private_offset_i8: -; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_i8() #0 { %load = load volatile i8, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: 
{{^}}sextload_private_offset_i8: -; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @sextload_private_offset_i8(ptr addrspace(1) %out) #0 { %load = load volatile i8, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) %sextload = sext i8 %load to i32 @@ -56,7 +56,7 @@ define amdgpu_kernel void @sextload_private_offset_i8(ptr addrspace(1) %out) #0 } ; GCN-LABEL: {{^}}zextload_private_offset_i8: -; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @zextload_private_offset_i8(ptr addrspace(1) %out) #0 { %load = load volatile i8, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) %zextload = zext i8 %load to i32 @@ -65,14 +65,14 @@ define amdgpu_kernel void @zextload_private_offset_i8(ptr addrspace(1) %out) #0 } ; GCN-LABEL: {{^}}load_private_offset_i16: -; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_i16() #0 { %load = load volatile i16, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}sextload_private_offset_i16: -; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @sextload_private_offset_i16(ptr addrspace(1) %out) #0 { %load = load volatile i16, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) %sextload = sext i16 %load to i32 @@ -81,7 +81,7 @@ define amdgpu_kernel void @sextload_private_offset_i16(ptr addrspace(1) %out) #0 } ; GCN-LABEL: {{^}}zextload_private_offset_i16: -; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @zextload_private_offset_i16(ptr addrspace(1) %out) #0 { %load = load 
volatile i16, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) %zextload = zext i16 %load to i32 @@ -90,28 +90,28 @@ define amdgpu_kernel void @zextload_private_offset_i16(ptr addrspace(1) %out) #0 } ; GCN-LABEL: {{^}}load_private_offset_i32: -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_i32() #0 { %load = load volatile i32, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}load_private_offset_v2i32: -; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_v2i32() #0 { %load = load volatile <2 x i32>, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}load_private_offset_v4i32: -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_v4i32() #0 { %load = load volatile <4 x i32>, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[12:15], 0 offset:4095 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:4095 define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { store volatile i8 5, ptr addrspace(5) inttoptr (i32 4095 to ptr addrspace(5)) ret void @@ -119,7 +119,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[12:15], 0 offen{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 
{ store volatile i8 5, ptr addrspace(5) inttoptr (i32 4096 to ptr addrspace(5)) ret void @@ -127,7 +127,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[12:15], 0 offen offset:1{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen offset:1{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { store volatile i8 5, ptr addrspace(5) inttoptr (i32 4097 to ptr addrspace(5)) ret void diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 0889f8ef6316ed..b4272049f36a4c 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_mul_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -31,7 +31,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_mul_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -50,7 +50,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_mul_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -69,7 +69,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: test_mul_v2i32: ; GFX10: ; %bb.0: 
; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -88,7 +88,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: test_mul_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -109,7 +109,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_mul_v2i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -157,7 +157,7 @@ entry: define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -179,7 +179,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_mul_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -201,7 +201,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_mul_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -223,7 +223,7 @@ define 
amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_mul_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -246,7 +246,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_mul_v4i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -271,7 +271,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_mul_v4i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -326,9 +326,9 @@ entry: define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: s_trunc_i64_mul_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[2:3], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -341,9 +341,9 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; ; VI-LABEL: s_trunc_i64_mul_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[2:3], 0x34 +; VI-NEXT: s_load_dword s7, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -356,10 
+356,10 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; ; GFX9-LABEL: s_trunc_i64_mul_to_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s7, s[2:3], 0x34 -; GFX9-NEXT: ; kill: killed $sgpr2_sgpr3 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 +; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s0, s4 @@ -373,11 +373,11 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX10-LABEL: s_trunc_i64_mul_to_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mul_i32 s0, s0, s6 +; GFX10-NEXT: s_mul_i32 s0, s2, s6 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -386,8 +386,8 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX11-LABEL: s_trunc_i64_mul_to_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mul_i32 s0, s0, s6 @@ -401,8 +401,8 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX12-LABEL: s_trunc_i64_mul_to_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mul_i32 s0, s0, s6 @@ -433,98 +433,98 @@ entry: define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_trunc_i64_mul_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_lo_u32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_trunc_i64_mul_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, 
s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_trunc_i64_mul_to_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_trunc_i64_mul_to_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; 
GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 ; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_trunc_i64_mul_to_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -548,8 +548,8 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX12-LABEL: v_trunc_i64_mul_to_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_mov_b32 
s10, -1 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000 ; GFX12-NEXT: s_mov_b32 s14, s10 @@ -603,8 +603,8 @@ entry: define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: mul64_sext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x50 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -617,11 +617,11 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: mul64_sext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x50 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s4, v0, 0 +; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_nop 2 @@ -630,43 +630,43 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: mul64_sext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s5, s4, 0x50 -; GFX9-NEXT: s_mulk_i32 s4, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mul_hi_i32 s0, s2, 0x50 +; GFX9-NEXT: s_mulk_i32 s2, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; 
GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_sext_c: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX10-NEXT: s_mul_hi_i32 s3, s4, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 +; GFX10-NEXT: s_mul_hi_i32 s1, s2, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul64_sext_c: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX11-NEXT: s_mul_hi_i32 s3, s4, 0x50 +; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 +; GFX11-NEXT: s_mul_hi_i32 s2, s2, 0x50 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -676,7 +676,7 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_sext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s3, s2, 31 ; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) @@ -711,8 +711,8 @@ entry: define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: mul64_zext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x50 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -725,11 +725,11 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: mul64_zext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x50 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_nop 2 @@ -738,43 +738,43 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: mul64_zext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s5, s4, 0x50 -; GFX9-NEXT: s_mulk_i32 s4, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mul_hi_u32 s0, s2, 0x50 +; GFX9-NEXT: s_mulk_i32 s2, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_zext_c: ; 
GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX10-NEXT: s_mul_hi_u32 s3, s4, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 +; GFX10-NEXT: s_mul_hi_u32 s1, s2, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul64_zext_c: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX11-NEXT: s_mul_hi_u32 s3, s4, 0x50 +; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 +; GFX11-NEXT: s_mul_hi_u32 s2, s2, 0x50 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -784,7 +784,7 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_zext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 @@ -818,7 +818,7 @@ entry: 
define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -838,7 +838,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_mul64_sext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -857,7 +857,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_sext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -877,7 +877,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: v_mul64_sext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -896,7 +896,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: v_mul64_sext_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -917,7 +917,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: v_mul64_sext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 
; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -965,7 +965,7 @@ entry: define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_zext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -985,7 +985,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_mul64_zext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_zext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1024,7 +1024,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: v_mul64_zext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1043,7 +1043,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: v_mul64_zext_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: 
v_mul64_zext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1112,7 +1112,7 @@ entry: define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_inline_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1131,7 +1131,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_mul64_sext_inline_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_mul64_sext_inline_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1168,7 +1168,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: v_mul64_sext_inline_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1187,7 +1187,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_mul64_sext_inline_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, 
-1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX12-LABEL: v_mul64_sext_inline_imm: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1256,9 +1256,9 @@ entry: define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: s_mul_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s5, s[0:1], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1269,9 +1269,9 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; ; VI-LABEL: s_mul_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dword s5, s[2:3], 0x70 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dword s5, s[0:1], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1282,41 +1282,40 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; ; GFX9-LABEL: s_mul_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; 
GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mul_i32 s0, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s4, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_mul_i32 s0, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s2, s2, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -1327,13 +1326,12 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; GFX12-LABEL: 
s_mul_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mul_i32 s2, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_i32 s2, s2, s3 +; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: s_mov_b32 s2, -1 ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1360,7 +1358,7 @@ entry: define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1378,7 +1376,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: v_mul_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1396,7 +1394,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_mul_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1414,7 +1412,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX10-LABEL: v_mul_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 
s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1432,7 +1430,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX11-LABEL: v_mul_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1452,7 +1450,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: v_mul_i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1498,9 +1496,9 @@ entry: define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind { ; SI-LABEL: s_mul_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s5, s[0:1], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1512,9 +1510,9 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; ; VI-LABEL: s_mul_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x70 -; VI-NEXT: s_load_dword s5, s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x70 +; VI-NEXT: s_load_dword s5, s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1526,42 +1524,42 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; ; GFX9-LABEL: s_mul_i1: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: s_load_dword s4, s[2:3], 0x70 -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x70 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mul_lo_u16_e32 v0, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mul_lo_u16_e32 v0, s3, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i1: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mul_lo_u16 v0, s4, s5 +; GFX10-NEXT: v_mul_lo_u16 v0, s2, s3 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mul_lo_u16 v0, s2, s3 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 
; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u16 v0, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -1572,13 +1570,13 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX12-LABEL: s_mul_i1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mul_lo_u16 v0, s2, s3 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mul_lo_u16 v0, s4, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -1622,7 +1620,7 @@ entry: define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1642,7 +1640,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: v_mul_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1662,7 +1660,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX9-LABEL: v_mul_i1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 
0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1682,7 +1680,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX10-LABEL: v_mul_i1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1703,7 +1701,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX11-LABEL: v_mul_i1: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1727,7 +1725,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX12-LABEL: v_mul_i1: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1795,8 +1793,8 @@ entry: define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: s_mul_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1815,8 +1813,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; ; VI-LABEL: s_mul_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -1833,8 +1831,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; ; GFX9-LABEL: s_mul_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1854,18 +1852,18 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX10-LABEL: s_mul_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s1, s6, s1 -; GFX10-NEXT: s_mul_hi_u32 s2, s6, s0 -; GFX10-NEXT: s_add_i32 s1, s2, s1 -; GFX10-NEXT: s_mul_i32 s2, s7, s0 -; GFX10-NEXT: s_mul_i32 s0, s6, s0 -; GFX10-NEXT: s_add_i32 s1, s1, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_mul_i32 s0, s6, s3 +; GFX10-NEXT: s_mul_hi_u32 s1, s6, s2 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_add_i32 s0, s1, s0 +; GFX10-NEXT: s_mul_i32 s1, s7, s2 +; GFX10-NEXT: s_mul_i32 s2, s6, s2 +; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_mov_b32 s1, s5 @@ -1875,8 +1873,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX11-LABEL: s_mul_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s1, s6, s1 @@ -1898,8 +1896,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX12-LABEL: s_mul_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 @@ -1934,21 +1932,21 @@ entry: define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; SI-LABEL: v_mul_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_lo_u32 v1, v2, v1 ; SI-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -1956,52 +1954,52 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; SI-NEXT: v_mul_lo_u32 v0, v2, v0 ; SI-NEXT: 
v_add_i32_e32 v1, vcc, v1, v4 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_mul_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v4, v2, v1 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v2, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0 ; VI-NEXT: v_mul_lo_u32 v0, v3, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; 
GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -2009,27 +2007,27 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: 
buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -2037,14 +2035,14 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2074,8 +2072,8 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX12-LABEL: v_mul_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_mov_b32 s10, -1 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, s10 @@ -2136,19 +2134,19 @@ entry: define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) { ; SI-LABEL: mul32_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_2 ; SI-NEXT: ; 
%bb.1: ; %else -; SI-NEXT: s_mul_i32 s6, s0, s1 +; SI-NEXT: s_mul_i32 s6, s2, s3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc @@ -2171,19 +2169,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: mul32_in_branch: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc0 .LBB15_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mul_i32 s6, s0, s1 +; VI-NEXT: s_mul_i32 s6, s2, s3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: .LBB15_3: ; %Flow -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz .LBB15_5 ; VI-NEXT: ; %bb.4: ; %if @@ -2206,19 +2204,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: mul32_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_mul_i32 s6, s0, s1 +; GFX9-NEXT: s_mul_i32 s6, s2, s3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: ; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $sgpr6 ; GFX9-NEXT: .LBB15_3: ; %Flow -; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX9-NEXT: ; %bb.4: ; %if @@ -2241,19 +2239,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: mul32_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_mul_i32 s5, s0, s1 +; GFX10-NEXT: s_mul_i32 s5, s2, s3 ; GFX10-NEXT: s_branch .LBB15_3 ; GFX10-NEXT: .LBB15_2: ; GFX10-NEXT: s_mov_b32 s4, -1 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: .LBB15_3: ; %Flow -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX10-NEXT: ; %bb.4: ; %if @@ -2276,19 +2274,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: mul32_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX11-NEXT: ; %bb.1: ; %else -; GFX11-NEXT: s_mul_i32 s5, s0, s1 +; GFX11-NEXT: s_mul_i32 s5, s2, s3 ; GFX11-NEXT: s_branch .LBB15_3 ; GFX11-NEXT: .LBB15_2: ; GFX11-NEXT: s_mov_b32 s4, -1 ; GFX11-NEXT: ; implicit-def: $sgpr5 ; GFX11-NEXT: .LBB15_3: ; %Flow -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB15_5 ; 
GFX11-NEXT: ; %bb.4: ; %if @@ -2313,19 +2311,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: mul32_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX12-NEXT: ; %bb.1: ; %else -; GFX12-NEXT: s_mul_i32 s5, s0, s1 +; GFX12-NEXT: s_mul_i32 s5, s2, s3 ; GFX12-NEXT: s_branch .LBB15_3 ; GFX12-NEXT: .LBB15_2: ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: ; implicit-def: $sgpr5 ; GFX12-NEXT: .LBB15_3: ; %Flow -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX12-NEXT: ; %bb.4: ; %if @@ -2405,7 +2403,7 @@ endif: define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ; SI-LABEL: mul64_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -2440,7 +2438,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: mul64_in_branch: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2472,7 +2470,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: mul64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2508,7 +2506,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: mul64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2542,7 +2540,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: mul64_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2577,7 +2575,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: mul64_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2670,9 +2668,9 @@ endif: define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 { ; SI-LABEL: s_mul_i128: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x1f -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x1f +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2719,9 +2717,9 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; VI-LABEL: s_mul_i128: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2758,96 +2756,96 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; GFX9-LABEL: s_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s7, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s12, s8, s6 -; GFX9-NEXT: s_add_i32 s7, s12, s7 -; GFX9-NEXT: s_mul_i32 s12, s9, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s12 -; GFX9-NEXT: s_mul_i32 s12, s10, s5 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4 -; GFX9-NEXT: s_add_i32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s11, s11, s4 -; GFX9-NEXT: s_mul_i32 s6, s8, s6 -; GFX9-NEXT: s_add_i32 s12, s12, s11 -; GFX9-NEXT: s_mul_i32 s10, s10, s4 -; GFX9-NEXT: s_add_u32 s10, s10, s6 -; GFX9-NEXT: s_addc_u32 s11, s12, s7 -; GFX9-NEXT: s_mul_i32 s14, s5, s8 -; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s13, s5, s8 +; GFX9-NEXT: s_mul_i32 s0, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s1, s12, s10 +; GFX9-NEXT: s_mul_i32 s2, s14, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s14, s8 +; GFX9-NEXT: s_add_i32 s0, s1, s0 +; GFX9-NEXT: s_mul_i32 s1, s13, s10 +; GFX9-NEXT: s_add_i32 s2, s3, s2 +; GFX9-NEXT: s_mul_i32 s3, s15, s8 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_i32 s1, s12, s10 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_mul_i32 s3, s14, s8 +; GFX9-NEXT: s_add_u32 
s3, s3, s1 +; GFX9-NEXT: s_addc_u32 s2, s2, s0 +; GFX9-NEXT: s_mul_i32 s14, s9, s12 +; GFX9-NEXT: s_mul_hi_u32 s15, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_i32 s7, s4, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9 -; GFX9-NEXT: s_add_u32 s7, s7, s14 -; GFX9-NEXT: s_addc_u32 s12, s12, 0 -; GFX9-NEXT: s_add_u32 s12, s13, s12 -; GFX9-NEXT: s_addc_u32 s13, 0, 0 -; GFX9-NEXT: s_mul_hi_u32 s14, s5, s9 -; GFX9-NEXT: s_mul_i32 s5, s5, s9 -; GFX9-NEXT: s_add_u32 s5, s5, s12 -; GFX9-NEXT: s_mov_b32 s6, 0 -; GFX9-NEXT: s_addc_u32 s9, s14, s13 -; GFX9-NEXT: s_add_u32 s10, s5, s10 -; GFX9-NEXT: s_mul_i32 s4, s4, s8 -; GFX9-NEXT: s_mov_b32 s5, s6 -; GFX9-NEXT: s_addc_u32 s9, s9, s11 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_mul_i32 s1, s8, s13 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13 +; GFX9-NEXT: s_add_u32 s1, s1, s14 +; GFX9-NEXT: s_addc_u32 s10, s10, 0 +; GFX9-NEXT: s_add_u32 s10, s11, s10 +; GFX9-NEXT: s_addc_u32 s11, 0, 0 +; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 +; GFX9-NEXT: s_mul_i32 s9, s9, s13 +; GFX9-NEXT: s_add_u32 s9, s9, s10 +; GFX9-NEXT: s_addc_u32 s10, s14, s11 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_add_u32 s9, s9, s3 +; GFX9-NEXT: s_addc_u32 s10, s10, s2 +; GFX9-NEXT: s_mul_i32 s2, s8, s12 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i128: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x4c -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s12, 0 -; GFX10-NEXT: s_mov_b32 s3, s12 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s13, s2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s8, s7 +; GFX10-NEXT: s_mul_i32 s3, s8, s7 ; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6 ; GFX10-NEXT: s_mul_i32 s14, s10, s5 ; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4 -; GFX10-NEXT: s_mul_i32 s13, s9, s6 +; GFX10-NEXT: s_mul_i32 s12, s9, s6 ; GFX10-NEXT: s_mul_i32 s11, s11, s4 -; GFX10-NEXT: s_add_i32 s2, s7, s2 +; GFX10-NEXT: s_add_i32 s3, s7, s3 ; GFX10-NEXT: s_add_i32 s7, s15, s14 ; GFX10-NEXT: s_mul_i32 s6, s8, s6 ; GFX10-NEXT: s_mul_i32 s10, s10, s4 -; GFX10-NEXT: s_add_i32 s2, s2, s13 +; GFX10-NEXT: s_add_i32 s3, s3, s12 ; GFX10-NEXT: s_add_i32 s7, s7, s11 ; GFX10-NEXT: s_mul_i32 s19, s5, s8 ; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8 ; GFX10-NEXT: s_add_u32 s6, s10, s6 ; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8 -; GFX10-NEXT: s_addc_u32 s7, s7, s2 +; GFX10-NEXT: s_addc_u32 s7, s7, s3 ; GFX10-NEXT: s_mul_i32 s17, s4, s9 -; GFX10-NEXT: s_add_u32 s2, s19, s20 +; GFX10-NEXT: s_add_u32 s3, s19, s20 ; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9 ; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9 ; GFX10-NEXT: s_mul_i32 s5, s5, s9 ; GFX10-NEXT: s_addc_u32 s9, s18, 0 -; GFX10-NEXT: s_add_u32 s13, s17, s2 +; GFX10-NEXT: s_add_u32 s3, s17, s3 ; GFX10-NEXT: s_addc_u32 s10, s16, 0 -; GFX10-NEXT: s_mul_i32 s2, s4, s8 +; GFX10-NEXT: s_mul_i32 s12, s4, s8 ; GFX10-NEXT: s_add_u32 s4, s9, s10 ; GFX10-NEXT: s_addc_u32 s8, 0, 0 ; GFX10-NEXT: s_add_u32 s4, s5, s4 ; GFX10-NEXT: s_addc_u32 s5, s21, s8 ; GFX10-NEXT: s_add_u32 s4, s4, s6 ; GFX10-NEXT: s_addc_u32 s5, s5, s7 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] +; GFX10-NEXT: s_or_b64 
s[2:3], s[12:13], s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -2860,46 +2858,46 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX11-LABEL: s_mul_i128: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x4c -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x7c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s12, 0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x4c +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s3, s12 +; GFX11-NEXT: s_mov_b32 s13, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s8, s7 +; GFX11-NEXT: s_mul_i32 s3, s8, s7 ; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6 ; GFX11-NEXT: s_mul_i32 s14, s10, s5 ; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4 -; GFX11-NEXT: s_mul_i32 s13, s9, s6 +; GFX11-NEXT: s_mul_i32 s12, s9, s6 ; GFX11-NEXT: s_mul_i32 s11, s11, s4 -; GFX11-NEXT: s_add_i32 s2, s7, s2 +; GFX11-NEXT: s_add_i32 s3, s7, s3 ; GFX11-NEXT: s_add_i32 s7, s15, s14 ; GFX11-NEXT: s_mul_i32 s6, s8, s6 ; GFX11-NEXT: s_mul_i32 s10, s10, s4 -; GFX11-NEXT: s_add_i32 s2, s2, s13 +; GFX11-NEXT: s_add_i32 s3, s3, s12 ; GFX11-NEXT: s_add_i32 s7, s7, s11 ; GFX11-NEXT: s_mul_i32 s19, s5, s8 ; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8 ; GFX11-NEXT: s_add_u32 s6, s10, s6 ; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8 -; GFX11-NEXT: s_addc_u32 s7, s7, s2 +; GFX11-NEXT: s_addc_u32 s7, s7, s3 ; GFX11-NEXT: s_mul_i32 s17, s4, s9 -; GFX11-NEXT: s_add_u32 s2, s19, s20 +; GFX11-NEXT: s_add_u32 s3, s19, s20 ; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9 ; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9 ; GFX11-NEXT: s_mul_i32 s5, s5, s9 ; GFX11-NEXT: s_addc_u32 s9, s18, 0 -; GFX11-NEXT: s_add_u32 s13, s17, s2 +; GFX11-NEXT: s_add_u32 s3, s17, s3 ; GFX11-NEXT: s_addc_u32 s10, s16, 
0 -; GFX11-NEXT: s_mul_i32 s2, s4, s8 +; GFX11-NEXT: s_mul_i32 s12, s4, s8 ; GFX11-NEXT: s_add_u32 s4, s9, s10 ; GFX11-NEXT: s_addc_u32 s8, 0, 0 ; GFX11-NEXT: s_add_u32 s4, s5, s4 ; GFX11-NEXT: s_addc_u32 s5, s21, s8 ; GFX11-NEXT: s_add_u32 s4, s4, s6 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] +; GFX11-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 @@ -2913,40 +2911,40 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX12-LABEL: s_mul_i128: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x7c -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x4c -; GFX12-NEXT: s_mov_b32 s13, 0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b32 s15, s13 -; GFX12-NEXT: s_mov_b32 s3, s13 -; GFX12-NEXT: s_mov_b32 s17, s13 -; GFX12-NEXT: s_mov_b32 s19, s13 -; GFX12-NEXT: s_mov_b32 s24, s13 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x7c +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x4c +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s15, s3 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s17, s3 +; GFX12-NEXT: s_mov_b32 s19, s3 +; GFX12-NEXT: s_mov_b32 s24, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s4 +; GFX12-NEXT: s_mov_b32 s2, s4 ; GFX12-NEXT: s_mov_b32 s14, s8 -; GFX12-NEXT: s_mov_b32 s2, s9 -; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[12:13] -; GFX12-NEXT: s_mul_u64 s[20:21], s[2:3], s[12:13] -; GFX12-NEXT: s_mov_b32 s12, s23 +; GFX12-NEXT: s_mov_b32 s12, s9 +; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[2:3] +; GFX12-NEXT: s_mul_u64 s[20:21], s[12:13], s[2:3] +; GFX12-NEXT: s_mov_b32 s2, s23 ; GFX12-NEXT: s_mov_b32 s16, s5 ; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[10:11] -; GFX12-NEXT: 
s_add_nc_u64 s[10:11], s[20:21], s[12:13] +; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[2:3] ; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[8:9] ; GFX12-NEXT: s_mul_u64 s[8:9], s[14:15], s[16:17] -; GFX12-NEXT: s_mov_b32 s12, s11 -; GFX12-NEXT: s_mov_b32 s11, s13 +; GFX12-NEXT: s_mov_b32 s2, s11 +; GFX12-NEXT: s_mov_b32 s11, s3 ; GFX12-NEXT: s_add_nc_u64 s[4:5], s[6:7], s[4:5] ; GFX12-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[10:11] -; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[16:17] +; GFX12-NEXT: s_mul_u64 s[12:13], s[12:13], s[16:17] ; GFX12-NEXT: s_mov_b32 s18, s7 +; GFX12-NEXT: s_mov_b32 s23, s3 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19] ; GFX12-NEXT: s_mov_b32 s25, s6 -; GFX12-NEXT: s_add_nc_u64 s[6:7], s[12:13], s[18:19] -; GFX12-NEXT: s_mov_b32 s23, s13 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7] -; GFX12-NEXT: s_or_b64 s[8:9], s[22:23], s[24:25] +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[12:13], s[2:3] +; GFX12-NEXT: s_or_b64 s[6:7], s[22:23], s[24:25] ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 @@ -3013,7 +3011,7 @@ entry: define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 { ; SI-LABEL: v_mul_i128: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v8, 4, v0 @@ -3062,7 +3060,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; VI-LABEL: v_mul_i128: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; VI-NEXT: v_mov_b32_e32 v11, 0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -3102,7 +3100,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3133,7 +3131,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX10-LABEL: v_mul_i128: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3165,9 +3163,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX11-LABEL: v_mul_i128: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c ; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -3205,9 +3201,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX12-LABEL: v_mul_i128: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c ; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 842dc36e001545..357b851a8f56f1 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr 
addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_i32 s2, s2, 0x180000 @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_smul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_smul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -100,7 +100,7 @@ entry: define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smulhi24_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_smulhi24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -126,7 +126,7 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_smulhi24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: 
s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -274,26 +274,26 @@ define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 { ; SI-LABEL: test_smul24_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0x13 +; SI-NEXT: s_load_dword s0, s[0:1], 0x1c +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 -; SI-NEXT: s_bfe_i32 s5, s5, 0x180000 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_mul_i32 s4, s5, s4 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_bfe_i32 s1, s2, 0x180000 +; SI-NEXT: s_bfe_i32 s0, s0, 0x180000 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mul_i32 s1, s0, s1 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dword s5, s[2:3], 0x70 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dword s5, s[0:1], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -307,19 +307,19 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 ; ; GFX9-LABEL: test_smul24_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: 
s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 -; GFX9-NEXT: s_bfe_i32 s5, s5, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s6, s5, s4 -; GFX9-NEXT: s_mul_i32 s5, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 +; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s2, s1, s0 +; GFX9-NEXT: s_mul_i32 s1, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i64: @@ -376,8 +376,8 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smul24_i64_square: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -390,8 +390,8 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_smul24_i64_square: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -403,17 +403,17 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, ; ; GFX9-LABEL: test_smul24_i64_square: ; GFX9: 
; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s5, s4, s4 -; GFX9-NEXT: s_mul_i32 s4, s4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s1, s0, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i64_square: @@ -463,33 +463,33 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 { ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dword s6, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s5, s4, 8 -; SI-NEXT: s_lshl_b32 s7, s6, 8 -; SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_mul_i32 s5, s4, s6 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 -; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_lshl_b32 s1, s2, 8 +; SI-NEXT: s_lshl_b32 s3, s0, 8 +; SI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; SI-NEXT: s_ashr_i64 s[0:1], 
s[0:1], 40 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_mul_i32 s1, s0, s2 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0 +; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dword s5, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s4, 8 -; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_lshl_b32 s3, s2, 8 +; VI-NEXT: s_lshl_b32 s5, s4, 8 ; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 ; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -504,23 +504,23 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; GFX9-LABEL: test_smul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s4, 8 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GFX9-NEXT: s_lshl_b32 s5, s6, 8 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 -; GFX9-NEXT: s_mul_hi_i32 s5, s4, s6 -; GFX9-NEXT: s_mul_i32 s4, s4, s6 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 
v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_lshl_b32 s1, s2, 8 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 +; GFX9-NEXT: s_lshl_b32 s1, s3, 8 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40 +; GFX9-NEXT: s_mul_hi_i32 s1, s0, s2 +; GFX9-NEXT: s_mul_i32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 31 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i33: @@ -580,9 +580,9 @@ entry: define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_smulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dword s5, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dword s5, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -594,9 +594,9 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; VI-LABEL: test_smulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dword s5, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -608,20 +608,20 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; GFX9-LABEL: test_smulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s4, 8 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GFX9-NEXT: s_lshl_b32 s5, s6, 8 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 -; GFX9-NEXT: s_mul_hi_i32 s4, s4, s6 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_lshl_b32 s1, s2, 8 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 +; GFX9-NEXT: s_lshl_b32 s1, s3, 8 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40 +; GFX9-NEXT: s_mul_hi_i32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smulhi24_i33: @@ -672,15 +672,15 @@ entry: define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) { ; SI-LABEL: simplify_i24_crash: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %bb7 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB8_2: ; %bb11 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_i32 s2, s4, 0x180000 @@ -694,15 +694,15 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, ; ; VI-LABEL: simplify_i24_crash: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc0 .LBB8_2 ; 
VI-NEXT: ; %bb.1: ; %bb7 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB8_2: ; %bb11 -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,24 +716,24 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, ; ; GFX9-LABEL: simplify_i24_crash: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %bb7 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB8_2: ; %bb11 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 -; GFX9-NEXT: s_bfe_i32 s5, s6, 0x180000 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x180000 +; GFX9-NEXT: s_bfe_i32 s1, s6, 0x180000 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: simplify_i24_crash: diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 0c0bb830ba847b..3a16c88f32cc3e 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ 
-9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xffffff @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_umul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_umul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -64,13 +64,13 @@ entry: define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i16 %b) { ; SI-LABEL: test_umul24_i16_sext: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s4, 16 -; SI-NEXT: s_mul_i32 s4, s4, s2 -; SI-NEXT: s_sext_i32_i16 s4, s4 +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: s_mul_i32 s2, s2, s4 +; SI-NEXT: s_sext_i32_i16 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -78,8 +78,8 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i ; ; VI-LABEL: test_umul24_i16_sext: ; VI: ; %bb.0: ; %entry -; VI-NEXT: 
s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,16 +92,16 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i ; ; GFX9-LABEL: test_umul24_i16_sext: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_sext_i32_i16 s4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: s_mul_i32 s2, s2, s0 +; GFX9-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %mul = mul i16 %a, %b @@ -113,7 +113,7 @@ entry: define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_umul24_i16_vgpr_sext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 @@ -136,7 +136,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr ; ; VI-LABEL: test_umul24_i16_vgpr_sext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -158,7 +158,7 @@ define amdgpu_kernel void 
@test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: test_umul24_i16_vgpr_sext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -186,13 +186,13 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b) { ; SI-LABEL: test_umul24_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s4, 16 -; SI-NEXT: s_mul_i32 s4, s4, s2 -; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: s_mul_i32 s2, s2, s4 +; SI-NEXT: s_and_b32 s4, s2, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -200,8 +200,8 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b ; ; VI-LABEL: test_umul24_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -214,16 +214,16 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b ; ; GFX9-LABEL: test_umul24_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 
0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: s_mul_i32 s2, s2, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %mul = mul i16 %a, %b @@ -235,7 +235,7 @@ entry: define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_umul24_i16_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 @@ -258,7 +258,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_umul24_i16_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -279,7 +279,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_umul24_i16_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -307,8 +307,8 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; SI-LABEL: test_umul24_i8_vgpr: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: 
s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: v_mov_b32_e32 v4, 0 @@ -330,8 +330,8 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: test_umul24_i8_vgpr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 @@ -351,11 +351,11 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: test_umul24_i8_vgpr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v3, v1, s[0:1] +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -379,7 +379,7 @@ entry: define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umulhi24_i32_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -392,7 +392,7 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_umulhi24_i32_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -405,7 +405,7 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; 
GFX9-LABEL: test_umulhi24_i32_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -432,9 +432,9 @@ entry: define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: test_umulhi24: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[2:3], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -447,9 +447,9 @@ define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; VI-LABEL: test_umulhi24: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[2:3], 0x34 +; VI-NEXT: s_load_dword s7, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -462,18 +462,19 @@ define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; GFX9-LABEL: test_umulhi24: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffffff -; GFX9-NEXT: s_and_b32 s0, s0, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: 
s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %a.24 = and i64 %a, 16777215 @@ -489,9 +490,9 @@ entry: define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: test_umul24_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[2:3], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -508,9 +509,9 @@ define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b ; ; VI-LABEL: test_umul24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[2:3], 0x34 +; VI-NEXT: s_load_dword s7, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -524,20 +525,21 @@ define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b ; ; GFX9-LABEL: test_umul24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffffff -; GFX9-NEXT: s_and_b32 s0, s0, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s2, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s1, s0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; 
GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i64 %a, 40 @@ -580,8 +582,8 @@ define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: test_umul24_i64_square: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -594,8 +596,8 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3 ; ; VI-LABEL: test_umul24_i64_square: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -606,17 +608,17 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3 ; ; GFX9-LABEL: test_umul24_i64_square: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s4 -; GFX9-NEXT: s_mul_i32 s4, s4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s1, s0, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i64 %a, 40 @@ -629,7 +631,7 @@ entry: define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umulhi16_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xffff @@ -645,7 +647,7 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_umulhi16_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -661,7 +663,7 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_umulhi16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -683,27 +685,27 @@ entry: define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_umul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dword s5, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s6, s4, 0xffffff -; SI-NEXT: s_and_b32 s7, s5, 0xffffff -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s4, v0 -; SI-NEXT: s_mul_i32 s6, s6, s7 +; SI-NEXT: s_and_b32 s1, s2, 0xffffff +; SI-NEXT: s_and_b32 s3, s0, 0xffffff +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 +; SI-NEXT: s_mul_i32 s1, s1, s3 ; SI-NEXT: v_and_b32_e32 v1, 1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dword s5, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,20 +718,20 @@ define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; GFX9-LABEL: test_umul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff -; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff -; GFX9-NEXT: s_mul_i32 s6, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 -; 
GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_i32 s2, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i33 %a, 9 @@ -745,9 +747,9 @@ entry: define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_umulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dword s5, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dword s5, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -759,9 +761,9 @@ define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; VI-LABEL: test_umulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dword s5, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -773,18 +775,18 @@ define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; GFX9-LABEL: test_umulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s3, 
s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff -; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i33 %a, 9 diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index 727b607e7ded06..16de2c0c6de08c 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -163,7 +163,7 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: multi_if_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll index 296d484e247d6e..f6e3509eb029b1 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) nocapture %arg) #0 { ; GCN-LABEL: reduced_nested_loop_conditions: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: s_mov_b32 s2, 0 @@ -93,6 +93,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) 
nocap ; IR: bb23: ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) ; IR-NEXT: ret void +; bb: %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %my.tmp1 = getelementptr inbounds i64, ptr addrspace(3) %arg, i32 %my.tmp @@ -276,6 +277,7 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]]) ; IR-NEXT: store volatile i32 0, ptr addrspace(1) undef, align 4 ; IR-NEXT: ret void +; bb: %my.tmp1134 = load volatile i32, ptr addrspace(1) undef %my.tmp1235 = icmp slt i32 %my.tmp1134, 9 diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index b84686139d0e2c..ba012b208c957a 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -120,61 +120,61 @@ bb.2: ; ASSUME1024: ; ScratchSize: 1040 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) { -; DEFAULTSIZE-V5-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: -; DEFAULTSIZE-V5: ; %bb.0: ; %entry -; DEFAULTSIZE-V5-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 -; DEFAULTSIZE-V5-NEXT: s_add_u32 s0, s0, s15 -; DEFAULTSIZE-V5-NEXT: s_addc_u32 s1, s1, 0 -; DEFAULTSIZE-V5-NEXT: s_mov_b32 s33, 0 -; DEFAULTSIZE-V5-NEXT: s_movk_i32 s32, 0x1000 -; DEFAULTSIZE-V5-NEXT: s_waitcnt lgkmcnt(0) -; DEFAULTSIZE-V5-NEXT: s_cmp_lg_u32 s4, 0 -; DEFAULTSIZE-V5-NEXT: s_cbranch_scc1 .LBB1_2 -; DEFAULTSIZE-V5-NEXT: ; %bb.1: ; %bb.0 -; DEFAULTSIZE-V5-NEXT: s_add_i32 s4, s32, 0x1000 -; DEFAULTSIZE-V5-NEXT: s_and_b32 s4, s4, 0xfffff000 -; DEFAULTSIZE-V5-NEXT: s_lshl_b32 s5, s5, 2 -; DEFAULTSIZE-V5-NEXT: s_mov_b32 s32, s4 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v1, 0 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, s4 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v3, 1 -; DEFAULTSIZE-V5-NEXT: s_add_i32 s4, s4, s5 -; DEFAULTSIZE-V5-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen -; DEFAULTSIZE-V5-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, s4 -; DEFAULTSIZE-V5-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; DEFAULTSIZE-V5-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0) -; DEFAULTSIZE-V5-NEXT: v_add_u32_e32 v0, v2, v0 -; DEFAULTSIZE-V5-NEXT: s_waitcnt lgkmcnt(0) -; DEFAULTSIZE-V5-NEXT: global_store_dword v1, v0, s[4:5] -; DEFAULTSIZE-V5-NEXT: .LBB1_2: ; %bb.1 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v0, 0 -; DEFAULTSIZE-V5-NEXT: global_store_dword v[0:1], v0, off -; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0) -; DEFAULTSIZE-V5-NEXT: s_endpgm +; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_mov_b32 s33, 0 +; MUBUF-NEXT: s_movk_i32 s32, 0x1000 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: s_cmp_lg_u32 s6, 0 +; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 +; MUBUF-NEXT: s_lshl_b32 s7, s7, 2 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: v_mov_b32_e32 v3, 1 +; MUBUF-NEXT: s_add_i32 s6, s6, s7 +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v0, v2, v0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: global_store_dword v1, v0, s[4:5] +; MUBUF-NEXT: .LBB1_2: ; %bb.1 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; 
MUBUF-NEXT: s_endpgm ; ; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 ; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: s_mov_b32 s32, 64 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: s_cmp_lg_u32 s0, 0 +; FLATSCR-NEXT: s_cmp_lg_u32 s2, 0 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 -; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1000 +; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000 +; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2 -; FLATSCR-NEXT: s_mov_b32 s32, s0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 -; FLATSCR-NEXT: s_add_i32 s0, s0, s1 -; FLATSCR-NEXT: scratch_load_dword v2, off, s0 -; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; FLATSCR-NEXT: s_lshl_b32 s3, s3, 2 +; FLATSCR-NEXT: s_mov_b32 s32, s2 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 +; FLATSCR-NEXT: s_add_i32 s2, s2, s3 +; FLATSCR-NEXT: scratch_load_dword v2, off, s2 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v0, v2, v0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -406,6 +406,3 @@ attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amd !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; ASSUME1024: {{.*}} -; DEFAULTSIZE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index 5c09d2bd61a399..9ab3eccd986a53 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -2104,7 +2104,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2115,7 +2115,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 1 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2128,7 +2128,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:1 glc dlc @@ -2138,7 +2138,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS @@ -2154,7 +2154,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { define amdgpu_kernel void 
@flat_inst_salu_offset_11bit_max(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2165,7 +2165,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2178,7 +2178,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc @@ -2188,7 +2188,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS @@ -2204,7 +2204,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2215,7 +2215,7 @@ define amdgpu_kernel void 
@flat_inst_salu_offset_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2228,7 +2228,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc @@ -2238,7 +2238,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS @@ -2254,7 +2254,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2267,7 +2267,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2280,7 +2280,7 @@ define 
amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2292,7 +2292,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS @@ -2302,7 +2302,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2315,7 +2315,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2334,7 +2334,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2347,7 +2347,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff800 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2360,7 +2360,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2372,7 +2372,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS @@ -2382,7 +2382,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2395,7 +2395,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2414,7 +2414,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2427,7 +2427,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2440,7 +2440,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2452,7 +2452,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS @@ -2462,7 +2462,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: 
flat_inst_salu_offset_neg_12bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2475,7 +2475,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2494,7 +2494,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2507,7 +2507,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2520,7 +2520,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) @@ -2532,7 +2532,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS @@ -2542,7 +2542,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2555,7 +2555,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2574,7 +2574,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2585,7 +2585,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2598,7 +2598,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc @@ -2608,7 +2608,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS @@ -2624,7 +2624,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2637,7 +2637,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2650,7 +2650,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2662,7 +2662,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS @@ -2672,7 +2672,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2685,7 +2685,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2704,7 +2704,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2717,7 +2717,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; 
GFX10-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x3fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2730,7 +2730,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x3000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2742,7 +2742,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS @@ -2752,7 +2752,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x3fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2765,7 +2765,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x3fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2784,7 +2784,7 @@ define amdgpu_kernel void 
@flat_inst_salu_offset_2x_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2797,7 +2797,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2810,7 +2810,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2822,7 +2822,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS @@ -2832,7 +2832,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2845,7 +2845,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2864,7 +2864,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2877,7 +2877,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2890,7 +2890,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2902,7 +2902,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; 
GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS @@ -2912,7 +2912,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2925,7 +2925,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2944,7 +2944,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2957,7 +2957,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2970,7 
+2970,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2982,7 +2982,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS @@ -2992,7 +2992,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -3005,7 +3005,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -3025,7 +3025,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 @@ -3037,7 +3037,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3050,7 +3050,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3062,7 +3062,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3074,7 +3074,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3087,7 +3087,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX11-GISEL-LABEL: 
flat_inst_salu_offset_64bit_11bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3100,7 +3100,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3120,7 +3120,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 @@ -3132,7 +3132,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x800 ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3145,7 +3145,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3157,7 +3157,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3169,7 +3169,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3182,7 +3182,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3195,7 +3195,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3215,7 +3215,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX9-SDAG: ; 
%bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 @@ -3227,7 +3227,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3240,7 +3240,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3252,7 +3252,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3264,7 +3264,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3277,7 +3277,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr 
%p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3290,7 +3290,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3310,7 +3310,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -3323,7 +3323,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3336,7 +3336,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 
0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3348,7 +3348,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3360,7 +3360,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3373,7 +3373,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3386,7 +3386,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3406,7 +3406,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; 
GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -3419,7 +3419,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3432,7 +3432,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3444,7 +3444,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3456,7 +3456,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3469,7 +3469,7 @@ define amdgpu_kernel void 
@flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3482,7 +3482,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3502,7 +3502,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -3515,7 +3515,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3528,7 +3528,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3540,7 +3540,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3552,7 +3552,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3565,7 +3565,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3578,7 +3578,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3598,7 +3598,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr %p) { ; 
GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3612,7 +3612,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3625,7 +3625,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s0 @@ -3638,7 +3638,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3651,7 +3651,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; 
GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3664,7 +3664,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3677,7 +3677,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3697,7 +3697,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3711,7 +3711,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x800 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3724,7 +3724,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX11-SDAG-LABEL: 
flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0 @@ -3737,7 +3737,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3750,7 +3750,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3763,7 +3763,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3776,7 +3776,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: 
s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3796,7 +3796,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3810,7 +3810,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3823,7 +3823,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s0 @@ -3836,7 +3836,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3849,7 +3849,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX9-GISEL-LABEL: 
flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3862,7 +3862,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3875,7 +3875,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3895,7 +3895,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3909,7 +3909,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3922,7 +3922,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0 @@ -3935,7 +3935,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3948,7 +3948,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3961,7 +3961,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3974,7 +3974,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; 
GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3994,7 +3994,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -4008,7 +4008,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4021,7 +4021,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s0 @@ -4034,7 +4034,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -4047,7 +4047,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4060,7 +4060,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4073,7 +4073,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -4093,7 +4093,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -4107,7 +4107,7 @@ 
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4120,7 +4120,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0 @@ -4133,7 +4133,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -4146,7 +4146,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4159,7 +4159,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4172,7 +4172,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll index b5b8213bcd57ee..10381bc21ecc96 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -2176,7 +2176,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc @@ -2186,7 +2186,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; ; GFX10-LABEL: global_inst_salu_offset_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc dlc @@ -2196,7 +2196,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; ; GFX11-LABEL: global_inst_salu_offset_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 
v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 glc dlc @@ -2208,7 +2208,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_salu_offset_1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS @@ -2226,7 +2226,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc @@ -2236,7 +2236,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; ; GFX10-LABEL: global_inst_salu_offset_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2246,7 +2246,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; ; GFX11-LABEL: global_inst_salu_offset_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc @@ -2258,7 +2258,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; ; GFX12-LABEL: global_inst_salu_offset_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS @@ -2276,7 +2276,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2286,7 +2286,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; ; GFX10-LABEL: global_inst_salu_offset_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2296,7 +2296,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; ; GFX11-LABEL: global_inst_salu_offset_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2308,7 +2308,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; ; GFX12-LABEL: global_inst_salu_offset_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS @@ -2326,7 +2326,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p define 
amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2336,7 +2336,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; ; GFX10-LABEL: global_inst_salu_offset_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2346,7 +2346,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; ; GFX11-LABEL: global_inst_salu_offset_13bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2358,7 +2358,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; ; GFX12-LABEL: global_inst_salu_offset_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS @@ -2376,7 +2376,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc @@ -2386,7 +2386,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; ; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc dlc @@ -2396,7 +2396,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; ; GFX11-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc @@ -2408,7 +2408,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; ; GFX12-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS @@ -2426,7 +2426,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc @@ -2436,7 +2436,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX10-GISEL: ; 
%bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2449,7 +2449,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX11-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc @@ -2461,7 +2461,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX12-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS @@ -2473,7 +2473,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2490,7 +2490,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000 @@ -2502,7 
+2502,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2515,7 +2515,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2530,7 +2530,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX12-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS @@ -2542,7 +2542,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2553,7 +2553,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 
0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2573,7 +2573,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2583,7 +2583,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; ; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2593,7 +2593,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; ; GFX11-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2605,7 +2605,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; ; GFX12-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS @@ -2623,7 +2623,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) define amdgpu_kernel void 
@global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2633,7 +2633,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; ; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2643,7 +2643,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; ; GFX11-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2655,7 +2655,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; ; GFX12-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS @@ -2673,7 +2673,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3000 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2683,7 +2683,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; ; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2693,7 +2693,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; ; GFX11-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2705,7 +2705,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; ; GFX12-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS @@ -2723,7 +2723,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc @@ -2733,7 +2733,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; 
GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2746,7 +2746,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX11-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc @@ -2758,7 +2758,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS @@ -2770,7 +2770,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2787,7 +2787,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 
s0, s0, 0xffffe000 @@ -2799,7 +2799,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2812,7 +2812,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2827,7 +2827,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS @@ -2839,7 +2839,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2850,7 +2850,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2870,7 +2870,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xffffc000 @@ -2882,7 +2882,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2895,7 +2895,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2910,7 +2910,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS @@ -2922,7 +2922,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2933,7 +2933,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2954,7 +2954,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff @@ -2966,7 +2966,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2979,7 +2979,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 
0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2994,7 +2994,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3009,7 +3009,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3020,7 +3020,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3034,7 +3034,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3055,7 +3055,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_64bit_11bit_split0(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x800 @@ -3067,7 +3067,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3080,7 +3080,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3095,7 +3095,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3110,7 +3110,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3121,7 +3121,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3135,7 +3135,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3156,7 +3156,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xfff @@ -3168,7 +3168,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3181,7 +3181,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3196,7 +3196,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3211,7 +3211,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3222,7 +3222,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3236,7 +3236,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; 
GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3257,7 +3257,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3269,7 +3269,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3282,7 +3282,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3297,7 +3297,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3312,7 +3312,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr 
addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3323,7 +3323,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3337,7 +3337,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3358,7 +3358,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3370,7 +3370,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3383,7 +3383,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3398,7 +3398,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3413,7 +3413,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1800, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3424,7 +3424,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3438,7 +3438,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; 
GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3459,7 +3459,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3471,7 +3471,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3484,7 +3484,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3499,7 +3499,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3514,7 +3514,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3525,7 +3525,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3539,7 +3539,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3560,7 +3560,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff @@ -3572,7 +3572,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff @@ -3584,7 +3584,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x7ff @@ -3598,7 +3598,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff @@ -3612,7 +3612,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x7ff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3634,7 +3634,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x800 @@ -3646,7 +3646,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x800 @@ -3658,7 +3658,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x800 @@ -3672,7 +3672,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 @@ -3686,7 +3686,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x800 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3708,7 +3708,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: 
global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xfff @@ -3720,7 +3720,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff @@ -3732,7 +3732,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0xfff @@ -3746,7 +3746,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff @@ -3760,7 +3760,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0xfff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3782,7 +3782,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_64bit_12bit_neg_high_split0(p define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3794,7 +3794,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3806,7 +3806,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3820,7 +3820,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 @@ -3834,7 +3834,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1000 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3856,7 +3856,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3868,7 +3868,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3880,7 +3880,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3894,7 +3894,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff @@ -3908,7 +3908,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; 
GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1fff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3930,7 +3930,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3942,7 +3942,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3954,7 +3954,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3968,7 +3968,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 @@ -3982,7 +3982,7 @@ define 
amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x2000 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll index df15f98ae27ff6..769d035858ca83 100644 --- a/llvm/test/CodeGen/AMDGPU/omod.ll +++ b/llvm/test/CodeGen/AMDGPU/omod.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; SI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -25,7 +25,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; ; VI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -43,14 +43,13 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -59,14 +58,13 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -86,7 +84,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; SI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -103,7 +101,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; ; VI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -121,14 +119,13 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, 
v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -137,14 +134,13 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1] ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -164,7 +160,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -181,7 +177,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; ; VI-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -199,14 +195,13 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -215,14 +210,13 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -242,7 +236,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #5 { ; SI-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; 
SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -259,7 +253,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; ; VI-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -277,14 +271,13 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -293,14 +286,13 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1] ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll index d73b1bd29c9813..bd7f9014d55cae 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll @@ -4,14 +4,14 @@ define amdgpu_kernel void @if_masked_1(i32 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp0_b32 s4, 0 -; GCN-NEXT: s_cselect_b32 s2, 22, 33 -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_cselect_b32 s0, 22, 33 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-NEXT: s_endpgm %and = and i32 %arg, 1 %cmp = icmp eq i32 %and, 0 @@ -23,14 +23,14 @@ define amdgpu_kernel void @if_masked_1(i32 %arg, ptr addrspace(1) %p) { define amdgpu_kernel void @if_masked_1024(i32 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_1024: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp0_b32 s4, 10 -; GCN-NEXT: s_cselect_b32 s2, 22, 33 -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_cselect_b32 s0, 22, 33 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-NEXT: s_endpgm %and = and i32 %arg, 1024 %cmp = icmp eq i32 %and, 0 @@ -42,14 +42,14 @@ define amdgpu_kernel void @if_masked_1024(i32 %arg, ptr addrspace(1) %p) { define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_0x80000000: ; GCN: ; %bb.0: -; 
GCN-NEXT: s_load_dword s4, s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp0_b32 s4, 31 -; GCN-NEXT: s_cselect_b32 s2, 22, 33 -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_cselect_b32 s0, 22, 33 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-NEXT: s_endpgm %and = and i32 %arg, 2147483648 %cmp = icmp eq i32 %and, 0 @@ -62,7 +62,7 @@ define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p) define amdgpu_kernel void @if_masked_0x8000000000000000(i64 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_0x8000000000000000: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll index 4ee2b8e981f449..a50a0766f67c2c 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-LABEL: negated_cond: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_mov_b32 s6, 0 @@ -92,7 +92,7 @@ bb4: define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1) { ; GCN-LABEL: negated_cond_dominated_blocks: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s6, 0 diff 
--git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index eff80236d98663..0473f803bfb30e 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX8-LABEL: or_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -70,7 +70,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -92,7 +92,7 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX8-LABEL: or_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -143,7 +143,7 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: scalar_or_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 
s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -156,7 +156,7 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX8-LABEL: scalar_or_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -185,40 +185,40 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, i32 %b) { ; GFX6-LABEL: vector_or_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s12, s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s12, s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s6 -; GFX6-NEXT: s_mov_b32 s1, s7 -; GFX6-NEXT: s_mov_b32 s3, s11 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: vector_or_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s12, s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s12, s[0:1], 0x34 
+; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s0, s6 -; GFX8-NEXT: s_mov_b32 s1, s7 -; GFX8-NEXT: s_mov_b32 s3, s11 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s11, s3 +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: vector_or_i32: @@ -246,8 +246,8 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: scalar_or_literal_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -258,8 +258,8 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) ; ; GFX8-LABEL: scalar_or_literal_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -286,8 +286,8 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_literal_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -300,8 +300,8 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 ; ; GFX8-LABEL: scalar_or_literal_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -332,43 +332,43 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; GFX6-LABEL: scalar_or_literal_multi_use_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x1d ; GFX6-NEXT: s_movk_i32 s8, 0x3039 ; GFX6-NEXT: s_mov_b32 s9, 0xf237b -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6-NEXT: s_add_u32 s0, s6, 0x3039 -; GFX6-NEXT: s_addc_u32 s1, s7, 0xf237b +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_add_u32 s0, s0, 0x3039 +; GFX6-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_addc_u32 s1, s1, 0xf237b ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_literal_multi_use_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74 ; GFX8-NEXT: s_movk_i32 s8, 0x3039 ; GFX8-NEXT: s_mov_b32 s9, 0xf237b ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_add_u32 s0, s2, 0x3039 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_add_u32 s0, s0, 0x3039 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX8-NEXT: s_addc_u32 s1, s3, 0xf237b +; GFX8-NEXT: s_addc_u32 s1, s1, 0xf237b ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -408,8 +408,8 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_inline_imm_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) @@ -421,8 +421,8 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x ; ; GFX8-LABEL: scalar_or_inline_imm_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -451,44 +451,44 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b32 s2, s6, 63 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_or_b32 s4, s6, 63 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: s_add_u32 s0, s0, 63 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 -; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_add_u32 s0, s8, 63 +; GFX6-NEXT: s_addc_u32 s1, s9, 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX8: 
; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b32 s2, s6, 63 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_or_b32 s4, s6, 63 +; GFX8-NEXT: s_mov_b32 s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_add_u32 s0, s0, 63 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_add_u32 s0, s8, 63 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm ; @@ -521,8 +521,8 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_neg_inline_imm_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, -1 @@ -534,8 +534,8 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ ; ; GFX8-LABEL: scalar_or_neg_inline_imm_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: 
s_load_dword s4, s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_mov_b32_e32 v1, -1 @@ -565,7 +565,7 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_literal_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -583,7 +583,7 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: vector_or_literal_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -624,7 +624,7 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_inline_immediate_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -642,7 +642,7 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ; ; GFX8-LABEL: vector_or_inline_immediate_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -683,8 +683,8 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) 
%out, i64 %a, i64 %b) { ; GFX6-LABEL: scalar_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -698,8 +698,8 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; GFX8-LABEL: scalar_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -730,48 +730,48 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 -; GFX6-NEXT: s_mov_b32 s3, s11 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s6 ; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s14, s10 -; GFX6-NEXT: s_mov_b32 s15, s11 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s14, s2 +; GFX6-NEXT: s_mov_b32 s15, s3 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: 
s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: vector_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 -; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_mov_b32 s11, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: s_mov_b32 s14, s10 -; GFX8-NEXT: s_mov_b32 s15, s11 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s14, s2 +; GFX8-NEXT: s_mov_b32 s15, s3 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: vector_or_i64: @@ -803,42 +803,42 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, i64 %b) { ; GFX6-LABEL: scalar_vector_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; 
GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s6 -; GFX6-NEXT: s_mov_b32 s1, s7 -; GFX6-NEXT: s_mov_b32 s3, s11 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s13, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_vector_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s0, s6 -; GFX8-NEXT: s_mov_b32 s1, s7 -; GFX8-NEXT: s_mov_b32 s3, s11 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s11, s3 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s13, v1 -; 
GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: scalar_vector_or_i64: @@ -867,7 +867,7 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_loadimm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -886,7 +886,7 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: vector_or_i64_loadimm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -931,7 +931,7 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -949,7 +949,7 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: vector_or_i64_imm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -990,7 +990,7 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr 
addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_neg_inline_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p ; ; GFX8-LABEL: vector_or_i64_neg_inline_imm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -1053,7 +1053,7 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_neg_literal: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: vector_or_i64_neg_literal: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -1116,9 +1116,9 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; GFX6-LABEL: trunc_i64_or_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x13 -; GFX6-NEXT: s_load_dword s5, s[2:3], 0x1d -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x13 +; GFX6-NEXT: s_load_dword s5, s[0:1], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1129,9 +1129,9 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], ; ; GFX8-LABEL: trunc_i64_or_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX8-NEXT: s_load_dword s5, s[2:3], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1159,21 +1159,21 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; GFX6-LABEL: or_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 -; GFX6-NEXT: s_mov_b32 s3, s11 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s6 ; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s14, s10 -; GFX6-NEXT: s_mov_b32 s15, s11 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s14, s2 +; GFX6-NEXT: s_mov_b32 s15, s3 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(1) ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1181,26 +1181,26 @@ define amdgpu_kernel void 
@or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX6-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: or_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 -; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_mov_b32 s11, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: s_mov_b32 s14, s10 -; GFX8-NEXT: s_mov_b32 s15, s11 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s14, s2 +; GFX8-NEXT: s_mov_b32 s15, s3 +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX8-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: or_i1: @@ -1244,8 +1244,8 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GFX6-LABEL: 
s_or_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1260,8 +1260,8 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; ; GFX8-LABEL: s_or_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll index 5792fab7011afe..e21b93a386c3e7 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX8-LABEL: s_pack_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX7-LABEL: s_pack_v2f16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -64,7 +64,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2f16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -76,7 +76,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX8-LABEL: s_pack_v2f16_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -89,7 +89,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX7-LABEL: s_pack_v2f16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { ; GFX9-LABEL: s_pack_v2f16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -125,7 +125,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX8-LABEL: s_pack_v2f16_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) @@ -138,7 +138,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX7-LABEL: s_pack_v2f16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -162,7 +162,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -178,7 +178,7 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX8-LABEL: v_pack_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -200,7 +200,7 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX7-LABEL: v_pack_v2f16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -240,7 +240,7 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, 
s[0:1] glc @@ -258,7 +258,7 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX8-LABEL: v_pack_v2f16_user: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -282,7 +282,7 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX7-LABEL: v_pack_v2f16_user: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -324,7 +324,7 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -339,7 +339,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX8-LABEL: v_pack_v2f16_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -356,7 +356,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX7-LABEL: v_pack_v2f16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -386,7 +386,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { define amdgpu_kernel 
void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16_inline_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -401,7 +401,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX8-LABEL: v_pack_v2f16_inline_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -418,7 +418,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX7-LABEL: v_pack_v2f16_inline_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -448,7 +448,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2f16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -463,7 +463,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX8-LABEL: v_pack_v2f16_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -480,7 +480,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX7-LABEL: v_pack_v2f16_imm_hi: ; 
GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -510,7 +510,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2f16_inline_f16imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -525,7 +525,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) ; ; GFX8-LABEL: v_pack_v2f16_inline_f16imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -542,7 +542,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) ; ; GFX7-LABEL: v_pack_v2f16_inline_f16imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -572,7 +572,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2f16_inline_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -586,7 +586,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX8-LABEL: 
v_pack_v2f16_inline_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -603,7 +603,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX7-LABEL: v_pack_v2f16_inline_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll index 529e64715500dd..4b21493bd7ca66 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX803-LABEL: s_pack_v2i16: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX803-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX7-LABEL: s_pack_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -62,7 +62,7 @@ define amdgpu_kernel void 
@s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2i16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -74,7 +74,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX803-LABEL: s_pack_v2i16_imm_lo: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) @@ -87,7 +87,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX7-LABEL: s_pack_v2i16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -110,7 +110,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { ; GFX9-LABEL: s_pack_v2i16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -122,7 +122,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX803-LABEL: s_pack_v2i16_imm_hi: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) @@ -135,7 +135,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) 
%in0) #0 { ; ; GFX7-LABEL: s_pack_v2i16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -158,7 +158,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -174,7 +174,7 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX803-LABEL: v_pack_v2i16: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -196,7 +196,7 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX7-LABEL: v_pack_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -234,7 +234,7 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -252,7 +252,7 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) 
%in0, ptr addrspac ; ; GFX803-LABEL: v_pack_v2i16_user: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -276,7 +276,7 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX7-LABEL: v_pack_v2i16_user: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -316,7 +316,7 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -331,7 +331,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX803-LABEL: v_pack_v2i16_imm_lo: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -348,7 +348,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX7-LABEL: v_pack_v2i16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -377,7 +377,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 { ; 
GFX9-LABEL: v_pack_v2i16_inline_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -391,7 +391,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX803-LABEL: v_pack_v2i16_inline_imm_lo: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -408,7 +408,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX7-LABEL: v_pack_v2i16_inline_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -437,7 +437,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2i16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -452,7 +452,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX803-LABEL: v_pack_v2i16_imm_hi: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -469,7 +469,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX7-LABEL: v_pack_v2i16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -498,7 +498,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2i16_inline_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -512,7 +512,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX803-LABEL: v_pack_v2i16_inline_imm_hi: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -529,7 +529,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX7-LABEL: v_pack_v2i16_inline_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll index c72a7ba3eee834..a3f7906a05f6b1 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll @@ -803,5 +803,5 @@ bb: declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 58b61510c24e8b..45fbaaabc65b58 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -111,4 +111,4 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) -attributes #0 = { nounwind "amdgpu-num-vgpr"="5" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind "amdgpu-num-vgpr"="5" } diff --git a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll index 560f0a06798102..8d180e7d33f84f 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll @@ -97,7 +97,7 @@ define <2 x i16> @trunc_srl_v2i64_16_to_v2i16(<2 x i64> %x) { define amdgpu_kernel void @s_trunc_srl_i64_16_to_i16(i64 %x) { ; GCN-LABEL: s_trunc_srl_i64_16_to_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s0, 4 diff --git a/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll b/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll index 8f450e5bcb83f3..031a46271f2c0e 100644 --- a/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll @@ -4,10 +4,10 @@ declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) ; 
OBJ-LABEL: : -; OBJ: v_permlane16_b32 v0, v0, s5, s6 op_sel:[1,0] +; OBJ: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; ASM-LABEL: permlane_op_sel: -; ASM: v_permlane16_b32 v0, v0, s5, s6 op_sel:[1,0] ; encoding: [0x00,0x08,0x77,0xd7,0x00,0x0b,0x18,0x00] +; ASM: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; encoding: [0x00,0x08,0x77,0xd7,0x00,0x0f,0x00,0x00] define amdgpu_kernel void @permlane_op_sel(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0) store i32 %v, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll b/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll index 4ae0547d11fff3..caa7fb8df19904 100644 --- a/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll +++ b/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll @@ -8,7 +8,6 @@ declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workitem.id.y() - define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec @@ -46,9 +45,8 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 } define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { -; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec -; SDAG-GFX11: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec -; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 
{{%[0-9]+}}, 0, implicit $exec + ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -126,8 +124,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src } define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { - ; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -170,8 +167,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(ptr addrspace(1) %out, i3 } define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}(s32), 0, implicit $exec - ; SDAG-GFX11: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}(s32), 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16(i32 %tidx, i32 %tidx, i32 %src1, 
i32 %src2, i1 false, i1 false) @@ -180,8 +176,7 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 % } define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANE16_B32_e64 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -191,8 +186,7 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANE16_B32_e64 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -201,8 +195,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr } define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, 
killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANE16_B32_e64 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -212,8 +205,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANE16_B32_e64 0, killed {{%[0-9]+}}, 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -223,8 +215,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANE16_B32_e64 4, killed {{%[0-9]+}}, 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}, 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, 
{{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -234,8 +225,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i } define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}(s32), 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}(s32), 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -244,8 +234,7 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -255,8 +244,7 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i3 } define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - 
; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -265,8 +253,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s } define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -276,8 +263,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, killed {{%[0-9]+}}, 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 0, 
{{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -287,8 +273,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 4, killed {{%[0-9]+}}, 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}, 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll index 69ddc9a48dbc43..6cab2b18393070 100644 --- a/llvm/test/CodeGen/AMDGPU/permute.ll +++ b/llvm/test/CodeGen/AMDGPU/permute.ll @@ -4,17 +4,17 @@ define amdgpu_kernel void @lsh8_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsh8_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x6050400 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -31,17 +31,17 @@ bb: define amdgpu_kernel void @lsr24_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsr24_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7060503 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, s2, v2, v3 +; GCN-NEXT: v_perm_b32 v2, s0, v2, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -58,17 +58,17 @@ bb: define amdgpu_kernel void @and_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_lsr24: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7060503 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -87,17 +87,17 @@ bb: define amdgpu_kernel void 
@and_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7020500 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -115,17 +115,17 @@ bb: define amdgpu_kernel void @lsh8_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsh8_or_lsr24: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x2010007 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, s2, v2, v3 +; GCN-NEXT: v_perm_b32 v2, s0, v2, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -142,17 +142,17 @@ bb: define amdgpu_kernel void @lsh16_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsh16_or_lsr24: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; 
GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x5040c03 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -169,17 +169,17 @@ bb: define amdgpu_kernel void @and_xor_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_xor_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7020104 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -197,15 +197,15 @@ bb: define amdgpu_kernel void @and_or_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, 
vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: s_and_b32 s0, s2, 0xff00 +; GCN-NEXT: s_and_b32 s0, s0, 0xff00 ; GCN-NEXT: s_or_b32 s0, s0, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v2, 0xff00ff, v2 @@ -227,17 +227,17 @@ bb: define amdgpu_kernel void @and_or_and_shl(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_and_shl: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x50c0c00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -255,17 +255,17 @@ bb: define amdgpu_kernel void @or_and_or(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: or_and_or: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7020104 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: flat_store_dword 
v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -283,20 +283,20 @@ bb: define amdgpu_kernel void @known_ffff0500(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: known_ffff0500: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, 0xffff8004 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v4, v[0:1] -; GCN-NEXT: s_bitset1_b32 s2, 15 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: s_and_b32 s0, s2, 0xff00 +; GCN-NEXT: s_bitset1_b32 s0, 15 +; GCN-NEXT: s_and_b32 s0, s0, 0xff00 ; GCN-NEXT: s_or_b32 s0, s0, 0xffff0000 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v4, 4, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xff00ff, v4 @@ -323,21 +323,21 @@ bb: define amdgpu_kernel void @known_050c0c00(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: known_050c0c00: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, 0x50c0c00 ; GCN-NEXT: v_mov_b32_e32 v6, 4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v4, v[0:1] -; GCN-NEXT: s_or_b32 s2, s2, 4 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, 
s0 +; GCN-NEXT: s_or_b32 s0, s0, 4 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v4, v4, s2, v5 +; GCN-NEXT: v_perm_b32 v4, v4, s0, v5 ; GCN-NEXT: flat_store_dword v[0:1], v4 ; GCN-NEXT: flat_store_dword v[2:3], v6 ; GCN-NEXT: s_endpgm @@ -359,22 +359,22 @@ bb: define amdgpu_kernel void @known_ffff8004(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: known_ffff8004: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, 0xffff0500 ; GCN-NEXT: v_mov_b32_e32 v6, 0xffff8004 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v4, v[0:1] -; GCN-NEXT: s_or_b32 s2, s2, 4 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: s_or_b32 s0, s0, 4 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v4, 0x8000, v4 -; GCN-NEXT: v_perm_b32 v4, v4, s2, v5 +; GCN-NEXT: v_perm_b32 v4, v4, s0, v5 ; GCN-NEXT: flat_store_dword v[0:1], v4 ; GCN-NEXT: flat_store_dword v[2:3], v6 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index bf98af33dc7b08..048a7756a7a048 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -609,53 +609,53 @@ define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %i ; GFX10-LABEL: shuffle8i8: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 8 ; GFX10-NEXT: s_lshr_b32 s4, s9, 16 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, s9 ; GFX10-NEXT: v_and_b32_e64 v1, 0xffffff00, s8 ; GFX10-NEXT: v_lshlrev_b16 v2, 8, s4 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, s8 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: v_or_b32_sdwa v0, s3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: v_or_b32_sdwa v0, s1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: shuffle8i8: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffffff00 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s3, s3, 8 +; GFX9-NEXT: s_lshr_b32 s1, s1, 8 ; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s9 -; GFX9-NEXT: v_or_b32_sdwa v4, s3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s3, s9, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NEXT: v_or_b32_sdwa v4, s1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_lshr_b32 s1, s9, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: v_lshlrev_b16_e64 v3, 8, s8 ; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s3 -; GFX9-NEXT: v_or_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s1 +; GFX9-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm bb: %vec0 = load <8 x i8>, ptr addrspace(1) %in0 diff --git a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll 
b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll index f53ca53518a172..4794c296215253 100644 --- a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll +++ b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @dbg_clause(ptr addrspace(1) %out, ptr addrspace(1) %aptr) !dbg !4 { ; GCN-LABEL: dbg_clause: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dword v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll index 5a03381447d0eb..a030f86da1b67d 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll @@ -8,7 +8,7 @@ ; NON-HSA: s_endpgm ; ASM: .fill 63, 4, 0xbf800000 ; s_nop 0 ; OBJ-COUNT-63: s_nop 0 -define amdgpu_kernel void @preload_kernarg_header(ptr inreg %arg) { +define amdgpu_kernel void @preload_kernarg_header(ptr %arg) { store ptr %arg, ptr %arg ret void } diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index a547c258e3921d..e076df97e1ba49 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -1,18 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-1 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 
-amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-4 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-1 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-4 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s -define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i8: +define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -23,51 +19,27 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 { ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i8: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; 
GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i8: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i8: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i8: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i8: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -78,56 +50,32 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 { ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i8: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i8: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i8: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i8: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_arg: +define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out, i8 zeroext %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -138,51 +86,29 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: 
ptr1_i8_zext_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -193,56 +119,34 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i16_preload_arg: +define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -253,51 +157,27 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xffff -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, 
s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xffff -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -308,56 +188,32 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xffff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xffff ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i32_preload_arg: +define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i32 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -367,47 +223,25 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg 
preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -417,52 +251,30 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i32 %arg0, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) #0 { -; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg: +define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) #0 { +; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dword s5, s[0:1], 0x0 @@ -474,55 +286,29 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x10 -; GFX940-PRELOAD-1-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s5, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-PRELOAD-2-LABEL: 
i32_ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s5, s4 +; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s2, s0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x10 -; GFX940-PRELOAD-4-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s5, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s5, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x0 @@ -534,60 +320,34 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-PRELOAD-1-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s3, s2 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s3, s2 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s6, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-PRELOAD-4-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s3, s2 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s6, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s3, s2 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %add = add i32 %arg0, %arg1 store i32 %add, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg: +define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -600,59 +360,33 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-1-NEXT: s_and_b32 s1, s4, 0xffff -; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s1, s0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: 
s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16 ; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-4-NEXT: s_and_b32 s1, s4, 0xffff -; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s1, s0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. 
Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 ; GFX940-PRELOAD-8-NEXT: s_and_b32 s1, s4, 0xffff ; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -665,56 +399,30 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_lshr_b32 s3, s2, 16 -; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s2, s3 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s1, s8, 0xffff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s3, s2, 16 -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s2, s3 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16 +; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s3, s2, 16 -; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s2, s3 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s1, s8, 0xffff +; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s3, s2, 16 -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s2, s3 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 %ext1 = zext i16 %arg1 to i32 @@ -723,8 +431,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ret void } -define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg: +define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -734,47 +442,29 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; 
GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-2-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; 
Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-8-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -784,52 +474,34 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. 
Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-2-NEXT: global_store_short v1, v0, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <2 x i8> %in, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { -; GFX940-NO-PRELOAD-LABEL: byref_preload_arg: +define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { +; GFX940-NO-PRELOAD-LABEL: byref_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -843,63 +515,37 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: byref_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: byref_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x100 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: byref_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: byref_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: byref_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: 
byref_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: byref_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: byref_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 @@ -913,59 +559,33 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] -; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[2:3] -; 
GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-LABEL: byref_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] -; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[2:3] -; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-LABEL: byref_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. 
Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) ; GFX90a-PRELOAD-8-NEXT: s_endpgm %in = load i32, ptr addrspace(4) %in.byref @@ -975,8 +595,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac } -define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v8i32_arg: +define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 @@ -995,83 +615,47 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v8i32_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-1-NEXT: 
v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_nop 1 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v8i32_arg: -; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-LABEL: v8i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_nop 1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v8i32_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 -; 
GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_nop 1 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v8i32_arg: -; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-LABEL: v8i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_nop 1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v8i32_arg: +; GFX90a-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -1090,87 +674,51 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v8i32_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 -; 
GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v8i32_arg: -; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-LABEL: v8i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-2-NEXT: s_nop 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v8i32_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v8i32_arg: -; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-LABEL: v8i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-8-NEXT: s_nop 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <8 x i32> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v3i16_preload_arg(ptr 
addrspace(1) nocapture %out, <3 x i16> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3i16_preload_arg: +define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1181,51 +729,29 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: v3i16_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3i16_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1236,55 +762,33 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: v3i16_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: v3i16_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x i16> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3i32_preload_arg: +define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -1296,55 +800,29 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg: -; 
GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: v3i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: v3i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3i32_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -1356,59 +834,33 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x i32> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3f32_preload_arg: +define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -1420,55 +872,29 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; 
GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: v3f32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: v3f32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3f32_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -1480,59 +906,33 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: v3f32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: v3f32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x float> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v5i8_preload_arg: +define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1543,51 +943,43 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s5 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-2-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-8: ; 
%bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-LABEL: v5i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s5 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-8-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v5i8_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1598,55 +990,47 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; 
-; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-2-LABEL: v5i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s9 +; GFX90a-PRELOAD-2-NEXT: global_store_byte v1, v2, s[6:7] offset:4 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v1, v0, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-8-LABEL: v5i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload 
header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s9 +; GFX90a-PRELOAD-8-NEXT: global_store_byte v1, v2, s[6:7] offset:4 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v1, v0, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <5 x i8> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v5f64_arg: +define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -1668,95 +1052,53 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v5f64_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 -; GFX940-PRELOAD-1-NEXT: 
s_load_dwordx2 s[12:13], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_nop 1 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v5f64_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 +; GFX940-PRELOAD-2-LABEL: v5f64_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[12:13] ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_nop 1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v5f64_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_nop 1 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v5f64_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 +; GFX940-PRELOAD-8-LABEL: v5f64_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[12:13] ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_nop 1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 ; 
GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v5f64_arg: +; GFX90a-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -1778,99 +1120,57 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v5f64_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v5f64_arg: -; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-LABEL: v5f64_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload 
header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-2-NEXT: s_nop 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v5f64_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 
v1, s13 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v5f64_arg: -; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-LABEL: v5f64_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-8-NEXT: s_nop 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; 
GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <5 x double> %in, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v8i8_preload_arg: +define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 @@ -1879,43 +1179,57 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 24 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-PRELOAD-4-NEXT: 
global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: v8i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: 
global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v8i8_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 @@ -1924,40 +1238,52 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: v8i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 24 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 16 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, 
v[0:1], s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: v8i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 16 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; 
GFX90a-PRELOAD-8-NEXT: s_endpgm store <8 x i8> %in, ptr addrspace(1) %out ret void @@ -1974,44 +1300,22 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; ; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; ; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: i64_kernel_preload_arg: @@ -2024,44 +1328,22 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; ; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; ; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i64 %a, ptr addrspace(1) %out, align 8 ret void @@ -2078,44 +1360,22 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; ; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; ; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: f64_kernel_preload_arg: @@ -2128,47 +1388,1137 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; ; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; ; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store double %in, ptr addrspace(1) %out ret void } -attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) %out, half %in) #0 { +; GFX940-NO-PRELOAD-LABEL: half_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: half_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: half_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: half_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: half_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: half_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store half %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) %out, bfloat %in) #0 { +; GFX940-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store bfloat %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2 x bfloat> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <2 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 x bfloat> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <3 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) %out, <6 x bfloat> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <6 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s10, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s10 +; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s7 +; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v0, s[8:9] offset:12 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s11 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] offset:12 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s9 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[10:11] offset:12 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s10, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v0, s[6:7] +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s3 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v0, s[8:9] offset:12 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x20 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s3 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[10:11] offset:12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s13 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[0:1] offset:12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store half %in, ptr addrspace(1) %out + store <7 x bfloat> %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) %out, i1 %in) #0 { +; GFX940-NO-PRELOAD-LABEL: i1_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; 
GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i1_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i1_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i1_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 1 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i1_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 1 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i1_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 1 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i1 %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128 %in) #0 { +; GFX940-NO-PRELOAD-LABEL: fp128_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: fp128_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s9 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: fp128_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s9 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: fp128_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: fp128_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s13 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: fp128_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s13 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store fp128 %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s5 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v7i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s5 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v7i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. 
Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s9 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6 +; GFX90a-PRELOAD-2-NEXT: global_store_short v2, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v2, v0, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v7i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s9 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6 +; GFX90a-PRELOAD-8-NEXT: global_store_short v2, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v2, v0, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <7 x i8> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) %out, <7 x half> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v7half_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v1, s[2:3] offset:12 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v7half_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s9 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v7half_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s9 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v7half_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v1, s[6:7] offset:12 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v7half_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s13 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] offset:12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v7half_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s13 +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7] offset:12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <7 x half> %in, ptr addrspace(1) %out + ret void +} + +; Test when previous argument was not dword aligned. 
+define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s7 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[4:5] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0xc +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s3 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[6:7] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[10:11] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i16 %in, ptr addrspace(1) %out + store i32 %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: s_load_dword s7, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, s7 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, s3 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NO-PRELOAD-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v4, s[6:7] +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s8 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v4, s[6:7] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s8 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v4, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i16 %in, ptr addrspace(1) %out + store <3 x i32> %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s6, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[2:3] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[10:11] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i16 %in, ptr addrspace(1) %out + store i16 %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s6, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg: +; 
GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: global_store_short v1, v2, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX90a-NO-PRELOAD-NEXT: global_store_short 
v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[2:3] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v2, s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v0, s[10:11] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i16 %in, ptr addrspace(1) %out + store <2 x i8> %in2, ptr addrspace(1) %out2 + ret void +} + +attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll index 0d88466fc31b3e..6fdc0d5834ef6e 100644 --- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -513,8 +513,8 @@ define amdgpu_kernel void @alloca_promote_atomicrmw_private_lds_promote(ptr addr ; ; GCN-LABEL: alloca_promote_atomicrmw_private_lds_promote: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -555,8 +555,8 @@ define amdgpu_kernel void @alloca_promote_cmpxchg_private(ptr 
addrspace(1) %out, ; ; GCN-LABEL: alloca_promote_cmpxchg_private: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index cf7efed46cef55..b6afb7cf8c9a11 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -14,13 +14,13 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -95,13 +95,13 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -165,14 +165,14 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -233,15 +233,15 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: clmem_read_simplified: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -346,13 +346,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -473,13 +473,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX900-NEXT: s_mov_b32 s38, -1 ; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s9 +; GFX900-NEXT: s_add_u32 s36, s36, s3 ; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX900-NEXT: s_getpc_b64 s[0:1] ; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX900-NEXT: v_mov_b32_e32 v31, v0 ; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -589,14 +589,14 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 
s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -701,13 +701,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX90A-NEXT: s_mov_b32 s38, -1 ; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s9 +; GFX90A-NEXT: s_add_u32 s36, s36, s3 ; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX90A-NEXT: s_getpc_b64 s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -811,15 +811,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: clmem_read: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 
s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 17, v0 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xff, v0 ; GFX11-NEXT: s_movk_i32 s1, 0x7f @@ -1033,13 +1033,13 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1119,13 +1119,13 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, 
_Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1176,14 +1176,14 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1238,15 +1238,15 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: Address32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 
s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1348,13 +1348,13 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1401,13 +1401,13 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1450,14 +1450,14 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: 
s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1496,15 +1496,15 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: Offset64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1574,13 +1574,13 @@ define amdgpu_kernel void 
@p32Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1624,13 +1624,13 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1669,14 +1669,14 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: 
s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1709,15 +1709,15 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: p32Offset64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1781,13 +1781,13 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX8-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s42, -1 ; GFX8-NEXT: s_mov_b32 s43, 0xe80000 -; GFX8-NEXT: s_add_u32 s40, s40, s9 +; GFX8-NEXT: s_add_u32 s40, s40, s3 ; GFX8-NEXT: s_addc_u32 s41, s41, 0 +; GFX8-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, 
_Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[42:43] @@ -1844,13 +1844,13 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s42, -1 ; GFX9-NEXT: s_mov_b32 s43, 0xe00000 -; GFX9-NEXT: s_add_u32 s40, s40, s9 +; GFX9-NEXT: s_add_u32 s40, s40, s3 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] @@ -1903,14 +1903,14 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX10-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s42, -1 ; GFX10-NEXT: s_mov_b32 s43, 0x31c16000 -; GFX10-NEXT: s_add_u32 s40, s40, s9 +; GFX10-NEXT: s_add_u32 s40, s40, s3 ; GFX10-NEXT: s_addc_u32 s41, s41, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 
v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX10-NEXT: s_mov_b64 s[2:3], s[42:43] @@ -1958,15 +1958,15 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; ; GFX11-LABEL: DiffBase: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff8000, v0 @@ -2051,13 +2051,13 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2132,13 +2132,13 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { 
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2201,14 +2201,14 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2273,15 +2273,15 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: ReverseOrder: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, 
_Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -2387,13 +2387,13 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2429,13 +2429,13 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 
_Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2470,14 +2470,14 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2507,15 +2507,15 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; ; GFX11-LABEL: negativeoffset: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: 
s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index 9a8d5acfbe3e96..5bb260c09c9ddb 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr addrspace(8) noalias %b) { ; SDAG-LABEL: buffers_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a ; ; GISEL-LABEL: buffers_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) @@ -50,7 +50,7 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) { ; SDAG-LABEL: buffers_from_flat_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-NEXT: s_mov_b32 s7, 0 ; SDAG-NEXT: s_mov_b32 s6, 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -69,7 +69,7 @@ 
define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr ; ; GISEL-LABEL: buffers_from_flat_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_mov_b32 s7, 0 ; GISEL-NEXT: s_mov_b32 s6, 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -110,7 +110,7 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) { ; SDAG-LABEL: buffers_might_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -132,7 +132,7 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac ; ; GISEL-LABEL: buffers_might_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) @@ -173,7 +173,7 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) { ; SDAG-LABEL: independent_offsets: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) { ; ; GISEL-LABEL: independent_offsets: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GISEL-NEXT: v_mov_b32_e32 v2, 1.0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll index 92465420a1ae73..74bad5ea3edce5 100644 --- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -748,21 +748,21 @@ define float @v_rcp_neg_fabs_f32_daz_ulp25(float %x) #0 { define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -800,21 +800,21 @@ define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: 
s_rcp_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -852,21 +852,21 @@ define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fast_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fast_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -904,21 +904,21 @@ define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, f define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -956,21 +956,21 @@ define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, f define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #2 { ; SI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1008,21 +1008,21 @@ define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr 
addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e64 v0, |s4| +; SI-NEXT: v_rcp_f32_e64 v0, |s2| +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fabs_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, |s4| +; VI-NEXT: v_rcp_f32_e64 v2, |s2| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1061,21 +1061,21 @@ define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float % define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_neg_rcp_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e64 v0, -s4 +; SI-NEXT: v_rcp_f32_e64 v0, -s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_neg_rcp_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, -s4 +; VI-NEXT: v_rcp_f32_e64 v2, -s2 ; VI-NEXT: v_mov_b32_e32 
v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1116,21 +1116,21 @@ define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %s define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e64 v0, -|s4| +; SI-NEXT: v_rcp_f32_e64 v0, -|s2| +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, -|s4| +; VI-NEXT: v_rcp_f32_e64 v2, -|s2| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1173,8 +1173,8 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, fl define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1188,13 +1188,13 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1 ; ; VI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, -|s4| +; VI-NEXT: v_rcp_f32_e64 v2, -|s2| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mul_f32_e64 v3, s4, -|s4| +; VI-NEXT: v_mul_f32_e64 v3, s2, -|s2| ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 @@ -1244,7 +1244,7 @@ define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 ; SI-LABEL: s_div_arcp_2_x_pat_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s4, s[0:1], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1254,10 +1254,10 @@ define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 ; ; VI-LABEL: s_div_arcp_2_x_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, s4, 0.5 +; VI-NEXT: v_mul_f32_e64 v2, s2, 0.5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1298,7 +1298,7 @@ define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 ; SI-LABEL: s_div_arcp_k_x_pat_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s4, s[0:1], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1309,11 +1309,11 @@ define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 ; ; VI-LABEL: s_div_arcp_k_x_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, 
s[0:1], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e32 v2, s4, v0 +; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1355,7 +1355,7 @@ define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) ; SI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s4, s[0:1], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1366,11 +1366,11 @@ define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) ; ; VI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e32 v2, s4, v0 +; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index b1fa85f7c675b7..24e420b7d657bf 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -3212,72 +3212,71 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) { define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %i21, ptr addrspace(1) nocapture noundef writeonly align 4 %arg, i32 noundef %arg1) #1 { ; GFX67-LABEL: compute_mad: ; GFX67: ; %bb.0: ; %bb -; GFX67-NEXT: s_load_dword 
s0, s[2:3], 0x6 -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX67-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4 -; GFX67-NEXT: s_mov_b32 s7, 0xf000 +; GFX67-NEXT: s_load_dword s3, s[0:1], 0x6 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_add_i32 s0, s0, 1 -; GFX67-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX67-NEXT: v_add_i32_e32 v2, vcc, s0, v1 +; GFX67-NEXT: s_load_dword s6, s[6:7], 0x1 +; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX67-NEXT: s_add_i32 s3, s3, 1 +; GFX67-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX67-NEXT: s_waitcnt lgkmcnt(0) +; GFX67-NEXT: s_and_b32 s6, s6, 0xffff +; GFX67-NEXT: s_mul_i32 s2, s2, s6 +; GFX67-NEXT: v_add_i32_e32 v2, vcc, s3, v1 ; GFX67-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-NEXT: s_load_dword s2, s[10:11], 0x1 -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX67-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX67-NEXT: v_mul_lo_u32 v3, v2, v1 -; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_and_b32 s2, s2, 0xffff +; GFX67-NEXT: s_mov_b32 s3, 0xf000 +; GFX67-NEXT: s_mov_b32 s2, 0 ; GFX67-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX67-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v3 -; GFX67-NEXT: s_mul_i32 s6, s6, s2 ; GFX67-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX67-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GFX67-NEXT: s_mov_b32 s6, 0 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX67-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX67-NEXT: v_mov_b32_e32 v2, s1 +; GFX67-NEXT: v_mov_b32_e32 v2, s5 ; GFX67-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX67-NEXT: v_add_i32_e32 v3, vcc, v3, v1 ; GFX67-NEXT: v_mul_lo_u32 v4, v3, v1 -; GFX67-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX67-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX67-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX67-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, v4, v3 -; GFX67-NEXT: buffer_store_dword v2, v[0:1], 
s[4:7], 0 addr64 +; GFX67-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: compute_mad: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x18 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x18 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_i32 s0, s0, 1 -; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_add_i32 s3, s3, 1 +; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s3, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1 -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x4 ; GFX8-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[10:11], 0x4 +; GFX8-NEXT: s_and_b32 s1, s3, 0xffff ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v3 -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX8-NEXT: s_mul_i32 s2, s2, s1 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s4, 0xffff -; GFX8-NEXT: s_mul_i32 s6, s6, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; 
GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v3 @@ -3288,104 +3287,102 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) % ; ; GFX900-LABEL: compute_mad: ; GFX900: ; %bb.0: ; %bb -; GFX900-NEXT: s_load_dword s0, s[2:3], 0x18 +; GFX900-NEXT: s_load_dword s3, s[0:1], 0x18 +; GFX900-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x10 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_add_i32 s0, s0, 1 -; GFX900-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX900-NEXT: v_add_u32_e32 v2, s0, v1 +; GFX900-NEXT: s_add_i32 s3, s3, 1 +; GFX900-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, s9 +; GFX900-NEXT: v_add_u32_e32 v2, s3, v1 ; GFX900-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX900-NEXT: v_add_u32_e32 v1, 1, v1 -; GFX900-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_load_dword s4, s[10:11], 0x4 -; GFX900-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX900-NEXT: s_load_dword s3, s[6:7], 0x4 +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX900-NEXT: v_mul_lo_u32 v3, v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, s1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_and_b32 s1, s4, 0xffff +; GFX900-NEXT: s_and_b32 s3, s3, 0xffff +; GFX900-NEXT: s_mul_i32 s2, s2, s3 ; GFX900-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX900-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX900-NEXT: v_add_u32_e32 v2, 1, v3 -; GFX900-NEXT: s_mul_i32 s6, s6, s1 -; GFX900-NEXT: v_add_u32_e32 v0, s6, v0 +; GFX900-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, s1 ; GFX900-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, s3 ; GFX900-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX900-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX900-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v3, v[1:2] -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX900-NEXT: v_mad_u64_u32 v[2:3], s[2:3], v1, v3, v[1:2] +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s0, v0 
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX900-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] -; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, v1, v[2:3] -; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s0, v3 +; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v1, v[2:3] +; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s8, v3 ; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc ; GFX900-NEXT: global_store_dword v[1:2], v0, off ; GFX900-NEXT: s_endpgm ; ; GFX90A-LABEL: compute_mad: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x18 -; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX90A-NEXT: s_load_dword s3, s[0:1], 0x18 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x10 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_add_i32 s4, s4, 1 -; GFX90A-NEXT: v_mul_lo_u32 v0, s4, v4 -; GFX90A-NEXT: v_add_u32_e32 v1, s4, v0 -; GFX90A-NEXT: v_mul_lo_u32 v1, v1, v4 -; GFX90A-NEXT: v_add_u32_e32 v0, 1, v0 -; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v0 -; GFX90A-NEXT: v_add_u32_e32 v0, v2, v0 -; GFX90A-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX90A-NEXT: v_add_u32_e32 v1, 1, v2 -; GFX90A-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX90A-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX90A-NEXT: s_load_dword s7, s[10:11], 0x4 -; GFX90A-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[2:3], v0, v2, v[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 -; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, v[2:3] +; GFX90A-NEXT: s_add_i32 s3, s3, 1 +; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX90A-NEXT: v_add_u32_e32 v2, s3, v1 +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, v2, v1 +; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX90A-NEXT: v_add_u32_e32 v2, 1, v3 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v2 +; GFX90A-NEXT: 
v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: s_load_dword s3, s[6:7], 0x4 +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v1 +; GFX90A-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v3, v[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s4, s7, 0xffff -; GFX90A-NEXT: s_mul_i32 s6, s6, s4 -; GFX90A-NEXT: v_add_u32_e32 v1, s6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s2, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-NEXT: s_and_b32 s3, s3, 0xffff +; GFX90A-NEXT: s_mul_i32 s2, s2, s3 +; GFX90A-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v2, v[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX90A-NEXT: global_store_dword v[2:3], v0, off +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v3, s9 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX90A-NEXT: global_store_dword v[0:1], v2, off ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: compute_mad: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x18 -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x18 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s0, s0, 1 -; GFX10-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v2, s0, v1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX10-NEXT: s_add_i32 s3, s3, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX10-NEXT: 
v_add_nc_u32_e32 v2, s3, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX10-NEXT: s_load_dword s4, s[10:11], 0x4 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v3, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s4, s4, 0xffff +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff ; GFX10-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, v2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, s6, s4, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, s2, s3, v[0:1] ; GFX10-NEXT: v_mul_lo_u32 v1, v3, v2 -; GFX10-NEXT: v_add_co_u32 v2, s2, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s2 +; GFX10-NEXT: v_add_co_u32 v2, s2, s4, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, s5, 0, s2 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], null, v1, v4, v[1:2] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v4, v1, v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll index f57e86c68ebf98..0c67f00d7bebf7 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll @@ -35,6 +35,8 @@ define <4 x float> @needs_extimg(float noundef %0, float noundef %1, <8 x i32> n ; IR: define void @caller( define void @caller(float noundef %0, float noundef %1, <8 x i32> noundef %2, <4 x i32> noundef %3) { + ; EXTIMG: call void @needs_extimg( + ; NOEXTIMG: call void null call void @needs_extimg(float %0, float %1, <8 x i32> %2, <4 x i32> %3) ; IR: ret void ret void @@ -43,6 +45,3 @@ define void @caller(float noundef %0, float noundef %1, <8 x i32> noundef %2, <4 declare <4 x float> 
@llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) attributes #0 = { "target-features"="+extended-image-insts" } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; EXTIMG: {{.*}} -; NOEXTIMG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll index e0b694ee58f0ef..a0380c82d9aaf0 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll @@ -115,6 +115,11 @@ @ConstantExpr = internal global i64 ptrtoint (ptr @needs_dpp to i64) define void @needs_dpp(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #0 { +; GFX7-NOT: define void @needs_dpp( +; GFX8: define void @needs_dpp( +; GFX9: define void @needs_dpp( +; GFX10: define void @needs_dpp( +; GFX11: define void @needs_dpp( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -134,6 +139,11 @@ endif: } define void @needs_16bit_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #1 { +; GFX7-NOT: define void @needs_16bit_insts( +; GFX8: define void @needs_16bit_insts( +; GFX9: define void @needs_16bit_insts( +; GFX10: define void @needs_16bit_insts( +; GFX11: define void @needs_16bit_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -153,6 +163,11 @@ endif: } define void @needs_gfx8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #2 { +; GFX7-NOT: define void @needs_gfx8_insts( +; GFX8: define void @needs_gfx8_insts( +; GFX9: define void @needs_gfx8_insts( +; GFX10: define void @needs_gfx8_insts( +; GFX11: define void @needs_gfx8_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -172,6 +187,11 @@ endif: } define void @needs_gfx9_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #3 { +; GFX7-NOT: define void @needs_gfx9_insts( +; GFX8-NOT: define void @needs_gfx9_insts( +; 
GFX9: define void @needs_gfx9_insts( +; GFX10: define void @needs_gfx9_insts( +; GFX11: define void @needs_gfx9_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -191,6 +211,11 @@ endif: } define void @needs_gfx10_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #4 { +; GFX7-NOT: define void @needs_gfx10_insts( +; GFX8-NOT: define void @needs_gfx10_insts( +; GFX9-NOT: define void @needs_gfx10_insts( +; GFX10: define void @needs_gfx10_insts( +; GFX11: define void @needs_gfx10_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -210,6 +235,11 @@ endif: } define void @needs_gfx11_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #5 { +; GFX7-NOT: define void @needs_gfx11_insts( +; GFX8-NOT: define void @needs_gfx11_insts( +; GFX9-NOT: define void @needs_gfx11_insts( +; GFX10-NOT: define void @needs_gfx11_insts( +; GFX11: define void @needs_gfx11_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -229,18 +259,34 @@ endif: } define void @needs_dot1_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #6 { +; GFX7-NOT: define void @needs_dot1_insts( +; GFX8-NOT: define void @needs_dot1_insts( +; GFX9: define void @needs_dot1_insts( +; GFX10: define void @needs_dot1_insts( +; GFX11-NOT: define void @needs_dot1_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot2_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #7 { +; GFX7-NOT: define void @needs_dot2_insts( +; GFX8-NOT: define void @needs_dot2_insts( +; GFX9: define void @needs_dot2_insts( +; GFX10: define void @needs_dot2_insts( +; GFX11-NOT: define void @needs_dot2_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot3_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #8 { +; GFX7-NOT: define void @needs_dot3_insts( +; GFX8-NOT: define void @needs_dot3_insts( +; GFX906-NOT: define void @needs_dot3_insts( +; GFX90A: define void @needs_dot3_insts( +; GFX10-NOT: define void 
@needs_dot3_insts( +; GFX11-NOT: define void @needs_dot3_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void @@ -248,30 +294,58 @@ define void @needs_dot3_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #8 { define void @needs_dot4_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #9 { +; GFX7-NOT: define void @needs_dot4_insts( +; GFX8-NOT: define void @needs_dot4_insts( +; GFX906-NOT: define void @needs_dot4_insts( +; GFX90A: define void @needs_dot4_insts( +; GFX10-NOT: define void @needs_dot4_insts( +; GFX11-NOT: define void @needs_dot4_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot5_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #10 { +; GFX7-NOT: define void @needs_dot5_insts( +; GFX8-NOT: define void @needs_dot5_insts( +; GFX906-NOT: define void @needs_dot5_insts( +; GFX90A: define void @needs_dot5_insts( +; GFX10: define void @needs_dot5_insts( +; GFX11: define void @needs_dot5_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot6_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #11 { +; GFX7-NOT: define void @needs_dot6_insts( +; GFX8-NOT: define void @needs_dot6_insts( +; GFX906-NOT: define void @needs_dot6_insts( +; GFX90A: define void @needs_dot6_insts( +; GFX10: define void @needs_dot6_insts( +; GFX11-NOT: define void @needs_dot6_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot7_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #12 { +; GFX7-NOT: define void @needs_dot7_insts( +; GFX8-NOT: define void @needs_dot7_insts( +; GFX9: define void @needs_dot7_insts( +; GFX10: define void @needs_dot7_insts( +; GFX11: define void @needs_dot7_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #13 { +; GFX7-NOT: define void @needs_dot8_insts( +; GFX8-NOT: define void @needs_dot8_insts( +; GFX9-NOT: define void @needs_dot8_insts( +; GFX10-NOT: define void 
@needs_dot8_insts( +; GFX11: define void @needs_dot8_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void @@ -279,22 +353,95 @@ define void @needs_dot8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #13 { ; IR: define void @caller( define void @caller(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) { + ; GFX7: call void null( + ; GFX8: call void @needs_dpp( + ; GFX9: call void @needs_dpp( + ; GFX10: call void @needs_dpp( + ; GFX11: call void @needs_dpp( call void @needs_dpp(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void @needs_16bit_insts( + ; GFX9: call void @needs_16bit_insts( + ; GFX10: call void @needs_16bit_insts( + ; GFX11: call void @needs_16bit_insts( call void @needs_16bit_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void @needs_gfx8_insts( + ; GFX9: call void @needs_gfx8_insts( + ; GFX10: call void @needs_gfx8_insts( + ; GFX11: call void @needs_gfx8_insts( call void @needs_gfx8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX9: call void @needs_gfx9_insts( + ; GFX10: call void @needs_gfx9_insts( ; GFX111: call void @needs_gfx9_insts(c call void @needs_gfx9_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX9: call void null( + ; GFX10: call void @needs_gfx10_insts( ; GFX111: call void @needs_gfx10_insts( call void @needs_gfx10_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX9: call void null( + ; GFX10: call void null( + ; GFX11: call void @needs_gfx11_insts( call void @needs_gfx11_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX9: call void @needs_dot1_insts( + ; GFX10: call void @needs_dot1_insts( + ; GFX11: call void null( call void @needs_dot1_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: 
call void null( + ; GFX9: call void @needs_dot2_insts( + ; GFX10: call void @needs_dot2_insts( + ; GFX11: call void null( call void @needs_dot2_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX906: call void null( + ; GFX90A: call void @needs_dot3_insts( + ; GFX10: call void null( + ; GFX11: call void null( call void @needs_dot3_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX906: call void null( + ; GFX90A: call void @needs_dot4_insts( + ; GFX10: call void null( + ; GFX11: call void null( call void @needs_dot4_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX906: call void null( + ; GFX90A: call void @needs_dot5_insts( + ; GFX10: call void @needs_dot5_insts( + ; GFX11: call void @needs_dot5_insts( call void @needs_dot5_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX906: call void null( + ; GFX90A: call void @needs_dot6_insts( + ; GFX10: call void @needs_dot6_insts( + ; GFX11: call void null( call void @needs_dot6_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX9: call void @needs_dot7_insts( + ; GFX10: call void @needs_dot7_insts( + ; GFX11: call void @needs_dot7_insts( call void @needs_dot7_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX9: call void null( + ; GFX10: call void null( + ; GFX11: call void @needs_dot8_insts( call void @needs_dot8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; IR: ret void ret void } @@ -312,12 +459,3 @@ attributes #10 = { "target-features"="+dot5-insts" } attributes #11 = { "target-features"="+dot6-insts" } attributes #12 = { "target-features"="+dot7-insts" } attributes #13 = { "target-features"="+dot8-insts" } -;; NOTE: These prefixes are unused and the list is 
autogenerated. Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX7: {{.*}} -; GFX8: {{.*}} -; GFX9: {{.*}} -; GFX906: {{.*}} -; GFX90A: {{.*}} -; IR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll index 2b1e3999a8aa8a..594fad389b6b97 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll @@ -38,7 +38,10 @@ define void @needs_gws(i32 %val0, i32 %val1) #0 { ; IR: define void @gws_caller( define void @gws_caller(i32 %val0, i32 %val1) { + ; COMPATIBLE: call void @needs_gws( + ; INCOMPATIBLE: call void null call void @needs_gws(i32 %val0, i32 %val1) + ; IR: ret void ret void } @@ -49,7 +52,3 @@ declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2 attributes #0 = { "target-features"="+gws"} attributes #1 = { convergent inaccessiblememonly nounwind } attributes #2 = { convergent inaccessiblememonly nounwind writeonly } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; COMPATIBLE: {{.*}} -; INCOMPATIBLE: {{.*}} -; IR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll index 32fed3ba22c590..2c2401f120cf5e 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll @@ -41,6 +41,8 @@ define i64 @needs_s_memrealtime() #0 { ; IR: define void @s_memrealtime_caller( define i64 @s_memrealtime_caller() { + ; REALTIME: call i64 @needs_s_memrealtime( + ; NOREALTIME: call i64 null %t = call i64 @needs_s_memrealtime() ; IR: ret i64 %t ret i64 %t @@ -55,6 +57,8 @@ define i64 @needs_s_memtime() #1 { ; IR: define void @s_memtime_caller( define i64 @s_memtime_caller() { + ; MEMTIME: call i64 @needs_s_memtime( + ; NOMEMTIME: call i64 null %t = call i64 @needs_s_memtime() ; IR: ret i64 %t ret i64 %t @@ -66,10 +70,3 @@ declare i64 @llvm.amdgcn.s.memtime() attributes #0 = { "target-features"="+s-memrealtime"} attributes #1 = { "target-features"="+s-memtime-inst"} -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; COMPATIBLE: {{.*}} -; INCOMPATIBLE: {{.*}} -; MEMTIME: {{.*}} -; NOMEMTIME: {{.*}} -; NOREALTIME: {{.*}} -; REALTIME: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index fdce4431fbbf25..a87973d93ac778 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -21,7 +21,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; SI-LABEL: rotl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_i32 s3, 32, s3 @@ -35,7 +35,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotl_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s3, 32, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -47,7 +47,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotl_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_i32 s3, 32, s3 @@ -57,7 +57,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: rotl_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s3, 32, s3 @@ -95,8 +95,8 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; 
SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -111,8 +111,8 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX8-LABEL: rotl_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s2, 32, s6 ; GFX8-NEXT: s_sub_i32 s3, 32, s7 @@ -128,22 +128,22 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: rotl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s2, 32, s7 -; GFX10-NEXT: s_sub_i32 s3, 32, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s3 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_sub_i32 s0, 32, s7 +; GFX10-NEXT: s_sub_i32 s1, 32, s6 +; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s0 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, 32, s7 @@ -188,8 +188,8 @@ define amdgpu_kernel void @rotl_v4i32(ptr 
addrspace(1) %in, <4 x i32> %x, <4 x i ; ; SI-LABEL: rotl_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -210,8 +210,8 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX8-LABEL: rotl_v4i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s3, 32, s9 ; GFX8-NEXT: s_sub_i32 s9, 32, s11 @@ -233,26 +233,26 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: rotl_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s2, 32, s8 -; GFX10-NEXT: s_sub_i32 s3, 32, s9 +; GFX10-NEXT: s_sub_i32 s0, 32, s8 +; GFX10-NEXT: s_sub_i32 s1, 32, s9 ; GFX10-NEXT: s_sub_i32 s8, 32, s11 ; GFX10-NEXT: s_sub_i32 s9, 32, s10 ; GFX10-NEXT: v_alignbit_b32 v3, s7, s7, s8 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s6, s9 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s2 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s1 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, 32, s8 diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 0e1dd69d930ae5..058ee589bc4b09 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -19,7 +19,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; SI-LABEL: rotr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -32,7 +32,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotr_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 @@ -43,7 +43,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotr_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 @@ -52,7 +52,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: rotr_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 @@ -84,8 +84,8 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i 
; ; SI-LABEL: rotr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -98,8 +98,8 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX8-LABEL: rotr_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 @@ -113,20 +113,20 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: rotr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s7 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s6 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s7 @@ -161,8 +161,8 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; SI-LABEL: rotr_v4i32: ; SI: ; %bb.0: ; %entry 
-; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -179,8 +179,8 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX8-LABEL: rotr_v4i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 @@ -198,22 +198,22 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: rotr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s7, s11 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s6, s10 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s9 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s8 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v3, s7, s7, s11 diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll index 
40a8592dba6df0..846fbdb33d668d 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -20,7 +20,7 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ; GCN-DAZ-UNSAFE-LABEL: rsq_f32: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -38,7 +38,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; GCN-IEEE-UNSAFE-LABEL: rsq_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -56,7 +56,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; GCN-DAZ-SAFE-LABEL: rsq_f32: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; SI-IEEE-SAFE-LABEL: rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -134,7 +134,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; CI-IEEE-SAFE-LABEL: rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 
s[8:11], s[2:3], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -198,39 +198,39 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) { ; GCN-DAZ-UNSAFE-LABEL: rsq_f32_sgpr: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s2, -1 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, s4 +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s2, -1 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm ; ; GCN-IEEE-UNSAFE-LABEL: rsq_f32_sgpr: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s2, -1 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, s4 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s2, -1 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm ; ; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-DAZ-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, s2 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 -; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, -1 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 @@ -245,21 +245,20 @@ define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %va ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-DAZ-SAFE-NEXT: s_endpgm ; ; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s2 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 @@ -289,15 +288,15 
@@ define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %va ; ; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dword s0, s[2:3], 0xb -; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s2 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 @@ -367,7 +366,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-UNSAFE-NEXT: s_endpgm ; GCN-DAZ-UNSAFE-LABEL: rsqrt_fmul: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, 0 ; GCN-DAZ-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -391,7 +390,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-IEEE-UNSAFE-LABEL: rsqrt_fmul: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, 0 ; GCN-IEEE-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -415,7 +414,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-DAZ-SAFE-LABEL: rsqrt_fmul: ; GCN-DAZ-SAFE: ; 
%bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0 ; GCN-DAZ-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -466,7 +465,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-IEEE-SAFE-LABEL: rsqrt_fmul: ; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0 ; GCN-IEEE-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -533,7 +532,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ; GCN-DAZ-UNSAFE-LABEL: neg_rsq_f32: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -552,7 +551,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -571,7 +570,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; GCN-DAZ-SAFE-LABEL: neg_rsq_f32: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 @@ -606,7 +605,7 
@@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; SI-IEEE-SAFE-LABEL: neg_rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -649,7 +648,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; CI-IEEE-SAFE-LABEL: neg_rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -714,7 +713,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ; GCN-DAZ-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -733,7 +732,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -752,7 +751,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GCN-DAZ-SAFE-LABEL: neg_rsq_neg_f32: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 @@ -787,7 +786,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -830,7 +829,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll index 78ea3b3699f2a5..0b58b950505244 100644 --- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll @@ -10,7 +10,7 @@ ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[VRESULT]] ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) #0 { +define amdgpu_kernel void @s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) { %add = add i32 %b, 65 store i32 %add, ptr addrspace(1) %out ret void @@ -20,7 +20,7 @@ define amdgpu_kernel void @s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) #0 { ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k0_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_addk_i32_k0_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %a, i32 %b) { %add0 = add i32 %a, 65 %add1 = add i32 %b, 65 store i32 %add0, ptr addrspace(1) %out0 @@ -31,7 
+31,7 @@ define amdgpu_kernel void @s_addk_i32_k0_x2(ptr addrspace(1) %out0, ptr addrspac ; SI-LABEL: {{^}}s_addk_i32_k1: ; SI: s_addk_i32 {{s[0-9]+}}, 0x7fff{{$}} ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k1(ptr addrspace(1) %out, i32 %b) #0 { +define amdgpu_kernel void @s_addk_i32_k1(ptr addrspace(1) %out, i32 %b) { %add = add i32 %b, 32767 ; (1 << 15) - 1 store i32 %add, ptr addrspace(1) %out ret void @@ -40,7 +40,7 @@ define amdgpu_kernel void @s_addk_i32_k1(ptr addrspace(1) %out, i32 %b) #0 { ; SI-LABEL: {{^}}s_addk_i32_k2: ; SI: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, 17 ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k2(ptr addrspace(1) %out, i32 %b) #0 { +define amdgpu_kernel void @s_addk_i32_k2(ptr addrspace(1) %out, i32 %b) { %add = add i32 %b, -17 store i32 %add, ptr addrspace(1) %out ret void @@ -49,7 +49,7 @@ define amdgpu_kernel void @s_addk_i32_k2(ptr addrspace(1) %out, i32 %b) #0 { ; SI-LABEL: {{^}}s_addk_i32_k3: ; SI: s_addk_i32 {{s[0-9]+}}, 0xffbf{{$}} ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) #0 { +define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) { %add = add i32 %b, -65 store i32 %add, ptr addrspace(1) %out ret void @@ -60,7 +60,7 @@ define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) #0 { ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42 ; SI: s_endpgm ; Note: dummy argument here to prevent combining of descriptor loads for %out and %b -define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 x i32> %b) #0 { +define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 x i32> %b) { %add = add <2 x i32> %b, store <2 x i32> %add, ptr addrspace(1) %out ret void @@ -72,7 +72,7 @@ define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44 ; SI: s_endpgm -define amdgpu_kernel void @s_addk_v4i32_k0(ptr addrspace(1) 
%out, <4 x i32> %b) #0 { +define amdgpu_kernel void @s_addk_v4i32_k0(ptr addrspace(1) %out, <4 x i32> %b) { %add = add <4 x i32> %b, store <4 x i32> %add, ptr addrspace(1) %out ret void @@ -88,7 +88,7 @@ define amdgpu_kernel void @s_addk_v4i32_k0(ptr addrspace(1) %out, <4 x i32> %b) ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x47 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x48 ; SI: s_endpgm -define amdgpu_kernel void @s_addk_v8i32_k0(ptr addrspace(1) %out, <8 x i32> %b) #0 { +define amdgpu_kernel void @s_addk_v8i32_k0(ptr addrspace(1) %out, <8 x i32> %b) { %add = add <8 x i32> %b, store <8 x i32> %add, ptr addrspace(1) %out ret void @@ -97,7 +97,7 @@ define amdgpu_kernel void @s_addk_v8i32_k0(ptr addrspace(1) %out, <8 x i32> %b) ; SI-LABEL: {{^}}no_s_addk_i32_k0: ; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}} ; SI: s_endpgm -define amdgpu_kernel void @no_s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) #0 { +define amdgpu_kernel void @no_s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) { %add = add i32 %b, 32768 ; 1 << 15 store i32 %add, ptr addrspace(1) %out ret void @@ -116,5 +116,5 @@ define amdgpu_kernel void @commute_s_addk_i32(ptr addrspace(1) %out, i32 %b) #0 declare i32 @llvm.amdgcn.groupstaticsize() #1 -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 35a5210d1c790b..0492c5663e6660 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; 
GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -30,8 +30,8 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a) { ; GCN-LABEL: v_sad_u32_constant_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x5a ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_sad_u32 v2, s2, v0, 20 @@ -55,8 +55,8 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -79,12 +79,12 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_sub_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 
s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s3, s0, s1 ; GCN-NEXT: s_min_u32 s0, s0, s1 @@ -93,7 +93,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 @@ -115,19 +115,19 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_add_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_sad_u32 v2, s0, v2, v3 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -147,19 +147,19 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_max_pat1: ; GCN: ; %bb.0: -; 
GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 @@ -182,19 +182,19 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_min_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt 
vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 @@ -218,19 +218,19 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_sub_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 @@ -251,12 +251,12 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_select_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: 
s_addc_u32 s9, s9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s1 ; GCN-NEXT: s_sub_i32 s6, s1, s0 @@ -266,7 +266,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 @@ -286,9 +286,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -322,9 +322,9 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -356,11 +356,11 @@ define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void 
@v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) { ; GCN-LABEL: v_sad_u32_i16_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 s4, s6, 0xffff ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 @@ -387,7 +387,7 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: flat_load_ushort v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: flat_load_ushort v1, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_load_ushort v2, v[0:1] glc @@ -415,8 +415,8 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) { ; GCN-LABEL: v_sad_u32_i8_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xff ; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 @@ -446,7 +446,7 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: flat_load_ubyte v1, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_load_ubyte v2, v[0:1] glc @@ -474,8 +474,8 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { 
define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { ; GCN-LABEL: s_sad_u32_i8_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s4, s2, 8 ; GCN-NEXT: s_and_b32 s3, s2, 0xff @@ -505,8 +505,8 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext % define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GCN-LABEL: v_sad_u32_mismatched_operands_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s6, s0, s1 ; GCN-NEXT: s_cmp_le_u32 s0, s1 @@ -534,8 +534,8 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) % define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GCN-LABEL: v_sad_u32_mismatched_operands_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s3 ; GCN-NEXT: s_sub_i32 s6, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index 684279a3776fc5..bd3c422b52efcd 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -15,8 +15,8 @@ declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; 
SI-LABEL: saddo_i64_zext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -38,8 +38,8 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; VI-LABEL: saddo_i64_zext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: s_add_u32 s2, s6, s0 @@ -59,20 +59,20 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; GFX9-LABEL: saddo_i64_zext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_add_u32 s2, s6, s0 +; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s3, s7, s1 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: 
global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm @@ -80,26 +80,26 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX10-LABEL: saddo_i64_zext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s2, s6, s0 -; GFX10-NEXT: s_addc_u32 s3, s7, s1 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7] -; GFX10-NEXT: s_xor_b32 s0, s0, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: s_add_u32 s0, s6, s2 +; GFX10-NEXT: s_addc_u32 s1, s7, s3 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7] +; GFX10-NEXT: s_xor_b32 s2, s2, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: saddo_i64_zext: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s6, s0 @@ -128,34 +128,34 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_saddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 
s[12:13], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_add_i32 s14, s12, s13 -; SI-NEXT: s_cmp_lt_i32 s13, 0 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: s_cmp_lt_i32 s14, s12 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_add_i32 s12, s8, s9 +; SI-NEXT: s_cmp_lt_i32 s9, 0 +; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; SI-NEXT: s_cmp_lt_i32 s12, s8 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_xor_b64 s[0:1], s[10:11], s[8:9] +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_saddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_add_i32 s4, s0, s1 @@ -175,15 +175,15 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_saddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_add_i32 s1, s0, s1 -; GFX9-NEXT: v_add_i32 v1, s0, v1 clamp -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_add_i32 s0, s2, s3 +; GFX9-NEXT: v_add_i32 v1, s2, v1 clamp +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] @@ -192,12 +192,12 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-LABEL: s_saddo_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_nc_i32 v0, s0, s1 clamp -; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_add_nc_i32 v0, s2, s3 clamp +; GFX10-NEXT: s_add_i32 s0, s2, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -208,8 +208,8 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-LABEL: s_saddo_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_nc_i32 v0, s4, s5 clamp ; GFX11-NEXT: s_add_i32 s4, s4, s5 @@ -234,7 +234,7 @@ define 
amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_saddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -264,7 +264,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_saddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -288,7 +288,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_saddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -304,7 +304,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_saddo_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -321,7 +321,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_saddo_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -352,7 +352,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr 
addrspace(1) %carryout, i64 %a, i64 %b) nounwind { ; SI-LABEL: s_saddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -379,7 +379,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_saddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 @@ -401,7 +401,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_saddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s8, s4, s6 @@ -420,7 +420,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: s_saddo_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s8, s4, s6 @@ -437,7 +437,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: s_saddo_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s8, s4, s6 ; GFX11-NEXT: s_addc_u32 s9, s5, s7 @@ -465,7 +465,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_saddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], 
s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -496,7 +496,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_saddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -521,7 +521,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_saddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] @@ -539,7 +539,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_saddo_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -558,7 +558,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_saddo_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -592,7 +592,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_saddo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; 
SI-NEXT: s_mov_b32 s14, s10 @@ -627,7 +627,7 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_saddo_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -656,7 +656,7 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_saddo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[4:5] @@ -676,7 +676,7 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_saddo_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -697,7 +697,7 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_saddo_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index 1700ce302cc9db..5260a4847f70d4 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: scalar_to_vector_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 
-1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: scalar_to_vector_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -53,7 +53,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: scalar_to_vector_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -73,7 +73,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: scalar_to_vector_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -219,8 +219,8 @@ bb: define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zeroext %val) nounwind { ; SI-LABEL: scalar_to_vector_test6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -230,8 +230,8 @@ define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zero ; ; VI-LABEL: scalar_to_vector_test6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll index 89a09dc4fcc171..baee88b69d0602 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX900-LABEL: scalar_to_vector_v8i16: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 @@ -22,7 +22,7 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; ; GFX906-LABEL: scalar_to_vector_v8i16: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 @@ -37,7 +37,7 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; ; GFX908-LABEL: scalar_to_vector_v8i16: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 @@ -52,9 +52,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; ; GFX90A-LABEL: scalar_to_vector_v8i16: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -86,7 +85,7 @@ entry: 
define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 { ; GFX900-LABEL: scalar_to_vector_v8f16: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 @@ -101,7 +100,7 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; ; GFX906-LABEL: scalar_to_vector_v8f16: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 @@ -116,7 +115,7 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; ; GFX908-LABEL: scalar_to_vector_v8f16: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 @@ -131,9 +130,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; ; GFX90A-LABEL: scalar_to_vector_v8f16: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index ad82869c001f6f..5f291489848fe6 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -23,7 +23,7 @@ 
define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ; def s[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_writelane_b32 v23, s2, 0 -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 ; CHECK-NEXT: v_writelane_b32 v23, s3, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:7] diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index b57a51f1382aec..6372d74161fad7 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s10, s2 @@ -60,7 +60,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TONGA-LABEL: sdiv_i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -104,7 +104,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -199,7 +199,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -220,7 +220,7 @@ 
define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: sdiv_i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -241,7 +241,7 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: sdiv_i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -293,7 +293,7 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: slow_sdiv_i32_3435: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -316,7 +316,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa ; ; TONGA-LABEL: slow_sdiv_i32_3435: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -339,7 +339,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: slow_sdiv_i32_3435: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -391,7 +391,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; 
GCN-LABEL: sdiv_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -462,7 +462,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: sdiv_v2i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -533,7 +533,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: sdiv_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -682,7 +682,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v2i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -707,7 +707,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: sdiv_v2i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -732,7 +732,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: sdiv_v2i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; 
GFX9-NEXT: s_mov_b32 s10, s6 @@ -791,7 +791,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_mov_b32 s6, s10 @@ -918,7 +918,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: sdiv_v4i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s11, 0xf000 ; TONGA-NEXT: s_mov_b32 s10, -1 ; TONGA-NEXT: s_mov_b32 s6, s10 @@ -1045,7 +1045,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -1305,7 +1305,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v4i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: sdiv_v4i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -1371,7 +1371,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr 
addrspace(1) ; ; GFX9-LABEL: sdiv_v4i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1449,7 +1449,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1482,7 +1482,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; TONGA-LABEL: v_sdiv_i8: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -1515,7 +1515,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1594,7 +1594,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i23: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1637,7 +1637,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i23: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -1680,7 +1680,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_sdiv_i23: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -1783,7 +1783,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i24: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1824,7 +1824,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i24: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -1865,7 +1865,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_sdiv_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -1962,7 +1962,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i25: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 
s10, s2 @@ -2009,7 +2009,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i25: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -2056,7 +2056,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_sdiv_i25: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -2189,7 +2189,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) { ; GCN-LABEL: scalarize_mulhs_4xi32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -2221,7 +2221,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read ; ; TONGA-LABEL: scalarize_mulhs_4xi32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read ; ; GFX9-LABEL: scalarize_mulhs_4xi32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 
f4776747f16ac1..c310e257adadc6 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -5,20 +5,20 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s8, s1, 31 -; GCN-NEXT: s_add_u32 s0, s0, s8 +; GCN-NEXT: s_ashr_i32 s8, s3, 31 +; GCN-NEXT: s_add_u32 s2, s2, s8 ; GCN-NEXT: s_mov_b32 s9, s8 -; GCN-NEXT: s_addc_u32 s1, s1, s8 -; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] +; GCN-NEXT: s_addc_u32 s3, s3, s8 +; GCN-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s4, 0, s10 ; GCN-NEXT: s_subb_u32 s5, 0, s11 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -140,8 +140,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GCN-IR-LABEL: s_test_sdiv: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31 @@ -460,8 +460,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s1, s[0:1], 0xe ; 
GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -490,8 +490,8 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -587,8 +587,8 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s8, s[2:3], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -628,8 +628,8 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s8, s[2:3], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -676,14 +676,14 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-NEXT: s_ashr_i64 s[8:9], s[2:3], 33 ; GCN-NEXT: s_abs_i32 s9, s8 
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -720,14 +720,14 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 33 ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -771,8 +771,8 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv23_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -801,8 +801,8 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv23_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -838,14 +838,14 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr 
addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-NEXT: s_ashr_i64 s[8:9], s[2:3], 39 ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -882,14 +882,14 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 39 ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -933,94 +933,94 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_sdiv24_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 
-1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[12:13], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GCN-NEXT: s_ashr_i64 s[0:1], s[10:11], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: s_xor_b32 s4, s4, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s1, s8, s2 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-NEXT: s_ashr_i64 s[10:11], s[14:15], 40 +; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v2 +; GCN-NEXT: s_or_b32 s7, s4, 1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s7, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s10 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s6 +; GCN-NEXT: s_xor_b32 s4, s6, s10 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_or_b32 s2, s0, 1 +; GCN-NEXT: s_or_b32 s6, s4, 1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s2, 0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; 
GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; GCN-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[12:13], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[10:11], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s1, s8, s2 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[14:15], 40 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-IR-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s1, v2 +; GCN-IR-NEXT: s_or_b32 s7, s4, 1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-IR-NEXT: 
s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s7, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GCN-IR-NEXT: s_xor_b32 s0, s0, s10 -; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s6 +; GCN-IR-NEXT: s_xor_b32 s4, s6, s10 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-IR-NEXT: s_or_b32 s2, s0, 1 +; GCN-IR-NEXT: s_or_b32 s6, s4, 1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-IR-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| -; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; GCN-IR-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr <2 x i64> %x, %2 = ashr <2 x i64> %y, @@ -1032,18 +1032,20 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_sdiv24_48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: 
s_sext_i32_i16 s2, s7 -; GCN-NEXT: s_sext_i32_i16 s1, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_sext_i32_i16 s5, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_sext_i32_i16 s4, s7 ; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_alignbit_b32 v2, s2, v2, 24 +; GCN-NEXT: v_alignbit_b32 v2, s4, v2, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 @@ -1055,35 +1057,33 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_mov_b32 s8, s4 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; GCN-NEXT: buffer_store_short v1, off, s[8:11], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[4:5], 24 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24 ; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 -; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24 -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 16 -; GCN-IR-NEXT: s_ashr_i32 s0, s1, 31 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 ; GCN-IR-NEXT: 
s_lshl_b64 s[4:5], s[4:5], 16 -; GCN-IR-NEXT: s_mov_b32 s1, s0 +; GCN-IR-NEXT: s_mov_b32 s3, s2 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16 ; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GCN-IR-NEXT: s_mov_b32 s5, s4 -; GCN-IR-NEXT: s_sub_u32 s12, s6, s0 -; GCN-IR-NEXT: s_subb_u32 s13, s7, s0 +; GCN-IR-NEXT: s_sub_u32 s12, s6, s2 +; GCN-IR-NEXT: s_subb_u32 s13, s7, s2 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s6, s6, s4 ; GCN-IR-NEXT: s_subb_u32 s7, s7, s4 @@ -1146,8 +1146,8 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] ; GCN-IR-NEXT: .LBB9_5: ; %udiv-end -; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GCN-IR-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3] ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 ; GCN-IR-NEXT: s_subb_u32 s1, s3, s1 @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_sdiv_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_sdiv_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s4, s3, 31 @@ -1853,7 +1853,7 @@ define i64 
@v_test_sdiv_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_sdiv24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1880,7 +1880,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_sdiv24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -1913,7 +1913,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_sdiv24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -1939,7 +1939,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_sdiv24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 669ed915a002ae..911bb44078d510 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; NOSDWA-LABEL: add_shr_i32: ; NOSDWA: ; %bb.0: -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -22,7 +22,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX89-LABEL: add_shr_i32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s2 ; GFX89-NEXT: v_mov_b32_e32 v1, s3 @@ -36,7 +36,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: add_shr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -47,7 +47,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: add_shr_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -65,7 +65,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; NOSDWA-LABEL: sub_shr_i32: ; NOSDWA: ; %bb.0: -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -80,7 +80,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX89-LABEL: sub_shr_i32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, 
s2 ; GFX89-NEXT: v_mov_b32_e32 v1, s3 @@ -94,7 +94,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: sub_shr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -105,7 +105,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: sub_shr_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -123,8 +123,8 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; NOSDWA-LABEL: mul_shr_i32: ; NOSDWA: ; %bb.0: -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -147,8 +147,8 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX89-LABEL: mul_shr_i32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -168,12 +168,12 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: mul_shr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -183,13 +183,13 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-LABEL: mul_shr_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -210,8 +210,8 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -231,8 +231,8 @@ define amdgpu_kernel void 
@mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX89-LABEL: mul_i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -252,12 +252,12 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX9-LABEL: mul_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v2 @@ -267,13 +267,13 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; GFX10-LABEL: mul_i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2 @@ -293,8 +293,8 @@ entry: define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: 
mul_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -319,8 +319,8 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX89-LABEL: mul_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -342,12 +342,12 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: mul_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -357,13 +357,13 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX10-LABEL: mul_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 
0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -383,8 +383,8 @@ entry: define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v4i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -415,8 +415,8 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX89-LABEL: mul_v4i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -441,12 +441,12 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: mul_v4i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -457,13 +457,13 @@ define amdgpu_kernel void @mul_v4i16(ptr 
addrspace(1) %out, ptr addrspace(1) %in ; GFX10-LABEL: mul_v4i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -484,8 +484,8 @@ entry: define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v8i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -528,8 +528,8 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX89-LABEL: mul_v8i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -560,12 +560,12 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: mul_v8i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 @@ -578,13 +578,13 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX10-LABEL: mul_v8i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v3, v3, v7 @@ -607,8 +607,8 @@ entry: define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -625,8 +625,8 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -643,12 +643,12 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -657,13 +657,13 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-LABEL: mul_half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-NEXT: global_store_short v0, v1, s[4:5] @@ -679,8 +679,8 @@ entry: define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v2half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; 
NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 @@ -703,8 +703,8 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mul_v2half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -723,12 +723,12 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mul_v2half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -737,13 +737,13 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: mul_v2half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -759,8 +759,8 @@ entry: define 
amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v4half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -789,8 +789,8 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mul_v4half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -812,12 +812,12 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mul_v4half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 @@ -827,13 +827,13 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: mul_v4half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 @@ -850,8 +850,8 @@ entry: define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v8half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v4, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v5, s7 @@ -892,8 +892,8 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mul_v8half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -921,12 +921,12 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mul_v8half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
v_pk_mul_f16 v3, v3, v7 ; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 @@ -938,13 +938,13 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: mul_v8half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7 ; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6 @@ -963,8 +963,8 @@ entry: define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v2, s7 ; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, s6, v0 @@ -983,8 +983,8 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX89-LABEL: mul_i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v2, s7 ; GFX89-NEXT: v_add_u32_e32 v1, vcc, s6, v0 @@ -1003,11 +1003,11 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX9-LABEL: mul_i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v2, v0, s[0:1] +; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v2 @@ -1017,12 +1017,12 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; GFX10-LABEL: mul_i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX10-NEXT: global_load_ubyte v2, v0, s[0:1] +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2 @@ -1042,8 +1042,8 @@ entry: define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v2i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1070,8 +1070,8 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_v2i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1093,12 +1093,12 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_v2i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v1, v2 @@ -1110,13 +1110,13 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-LABEL: mul_v2i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b16 v0, 8, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1142,8 +1142,8 @@ entry: define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v4i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; 
NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1182,8 +1182,8 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_v4i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1209,12 +1209,12 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_v4i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v1, v2 @@ -1230,13 +1230,13 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-LABEL: mul_v4i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v1 @@ -1271,8 +1271,8 @@ entry: 
define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v8i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1330,8 +1330,8 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_v8i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1364,12 +1364,12 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_v8i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v5, v1, v3 @@ -1392,13 +1392,13 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-LABEL: mul_v8i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshrrev_b16 v6, 8, v0 @@ -1449,7 +1449,7 @@ entry: define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; NOSDWA-LABEL: sitofp_v2i16_to_v2f16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -1467,7 +1467,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX89-LABEL: sitofp_v2i16_to_v2f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s2 ; GFX89-NEXT: v_mov_b32_e32 v1, s3 @@ -1483,7 +1483,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX9-LABEL: sitofp_v2i16_to_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1496,7 +1496,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX10-LABEL: sitofp_v2i16_to_v2f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -1519,8 +1519,8 @@ entry: define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: 
mac_v2half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 @@ -1543,8 +1543,8 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mac_v2half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v2, s0 @@ -1566,12 +1566,12 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mac_v2half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX9-NEXT: v_pk_add_f16 v1, v1, v2 @@ -1581,13 +1581,13 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: mac_v2half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: 
global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX10-NEXT: v_pk_add_f16 v1, v1, v2 @@ -1605,7 +1605,7 @@ entry: define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; NOSDWA-LABEL: immediate_mul_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -1625,7 +1625,7 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX89-LABEL: immediate_mul_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX89-NEXT: v_mov_b32_e32 v3, 0x141 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) @@ -1644,7 +1644,7 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: immediate_mul_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1657,7 +1657,7 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: immediate_mul_v2i16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1679,8 +1679,8 @@ entry: define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mulmul_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; 
NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1708,8 +1708,8 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX89-LABEL: mulmul_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1733,12 +1733,12 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: mulmul_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -1749,13 +1749,13 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-LABEL: mulmul_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: 
global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1777,8 +1777,8 @@ entry: define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: add_bb_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1802,8 +1802,8 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX89-LABEL: add_bb_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1822,12 +1822,12 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: add_bb_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -1836,13 +1836,13 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr 
addrspace(1) ; GFX10-LABEL: add_bb_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -1863,7 +1863,7 @@ store_label: define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrspace(1) %destValues) #0 { ; NOSDWA-LABEL: pulled_out_test: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 @@ -1900,7 +1900,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; ; GFX89-LABEL: pulled_out_test: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: v_mov_b32_e32 v4, 8 ; GFX89-NEXT: v_mov_b32_e32 v5, 0xff ; GFX89-NEXT: s_waitcnt lgkmcnt(0) @@ -1929,7 +1929,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; ; GFX9-LABEL: pulled_out_test: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1955,7 +1955,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; ; GFX10-LABEL: pulled_out_test: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 8 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 @@ -2198,8 +2198,8 @@ bb2: define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mac_v2half_same_srcop: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 @@ -2222,8 +2222,8 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr ; ; GFX89-LABEL: mac_v2half_same_srcop: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -2245,11 +2245,11 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: mac_v2half_same_srcop: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v1 @@ -2261,12 +2261,12 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: mac_v2half_same_srcop: ; GFX10: ; %bb.0: ; 
%entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll index f11e86aef683d1..d807c3909e656e 100644 --- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.sffbh.i32(i32) nounwind readnone speculatable define amdgpu_kernel void @select_constant_cttz(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GCN-LABEL: select_constant_cttz: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index cc109595d8d703..0992e9e300f136 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -6,35 +6,35 @@ define amdgpu_kernel void @select_f16( ; SI-LABEL: select_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s18, s14 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 
s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: s_mov_b32 s19, s15 +; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 -; SI-NEXT: s_mov_b32 s22, s14 -; SI-NEXT: s_mov_b32 s23, s15 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_mov_b32 s23, s3 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -42,50 +42,50 @@ define amdgpu_kernel void @select_f16( ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: select_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s15, 0xf000 -; VI-NEXT: s_mov_b32 s14, -1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s18, s14 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 +; VI-NEXT: s_mov_b32 s18, s2 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s6 ; VI-NEXT: s_mov_b32 s17, s7 -; VI-NEXT: s_mov_b32 s19, s15 +; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s20, s8 ; VI-NEXT: s_mov_b32 s21, s9 -; VI-NEXT: s_mov_b32 s22, s14 -; VI-NEXT: s_mov_b32 s23, s15 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_mov_b32 s23, s3 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s14 -; VI-NEXT: s_mov_b32 s11, s15 -; VI-NEXT: s_mov_b32 s2, s14 -; VI-NEXT: s_mov_b32 s3, s15 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: buffer_store_short v0, off, s[12:15], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: select_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX11-NEXT: s_mov_b32 s14, -1 ; GFX11-NEXT: s_mov_b32 s15, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s14 @@ -139,7 +139,7 @@ entry: define amdgpu_kernel void @select_f16_imm_a( ; SI-LABEL: select_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: 
s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -174,7 +174,7 @@ define amdgpu_kernel void @select_f16_imm_a( ; ; VI-LABEL: select_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -205,7 +205,7 @@ define amdgpu_kernel void @select_f16_imm_a( ; ; GFX11-LABEL: select_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -252,7 +252,7 @@ entry: define amdgpu_kernel void @select_f16_imm_b( ; SI-LABEL: select_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -287,7 +287,7 @@ define amdgpu_kernel void @select_f16_imm_b( ; ; VI-LABEL: select_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -318,7 +318,7 @@ define amdgpu_kernel void @select_f16_imm_b( ; ; GFX11-LABEL: select_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -365,7 +365,7 @@ entry: define amdgpu_kernel void @select_f16_imm_c( ; SI-LABEL: select_f16_imm_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -400,7 +400,7 @@ define amdgpu_kernel void 
@select_f16_imm_c( ; ; VI-LABEL: select_f16_imm_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -432,7 +432,7 @@ define amdgpu_kernel void @select_f16_imm_c( ; ; GFX11-LABEL: select_f16_imm_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -479,7 +479,7 @@ entry: define amdgpu_kernel void @select_f16_imm_d( ; SI-LABEL: select_f16_imm_d: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -514,7 +514,7 @@ define amdgpu_kernel void @select_f16_imm_d( ; ; VI-LABEL: select_f16_imm_d: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -546,7 +546,7 @@ define amdgpu_kernel void @select_f16_imm_d( ; ; GFX11-LABEL: select_f16_imm_d: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -593,31 +593,31 @@ entry: define amdgpu_kernel void @select_v2f16( ; SI-LABEL: select_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s18, s14 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: 
s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: s_mov_b32 s19, s15 +; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 -; SI-NEXT: s_mov_b32 s22, s14 -; SI-NEXT: s_mov_b32 s23, s15 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 ; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -642,36 +642,36 @@ define amdgpu_kernel void @select_v2f16( ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: select_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s15, 0xf000 -; VI-NEXT: s_mov_b32 s14, -1 -; VI-NEXT: s_mov_b32 s2, s14 -; VI-NEXT: s_mov_b32 s3, s15 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, 
s6 ; VI-NEXT: s_mov_b32 s17, s7 -; VI-NEXT: s_mov_b32 s18, s14 -; VI-NEXT: s_mov_b32 s19, s15 +; VI-NEXT: s_mov_b32 s18, s2 +; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s20, s8 ; VI-NEXT: s_mov_b32 s21, s9 -; VI-NEXT: s_mov_b32 s22, s14 -; VI-NEXT: s_mov_b32 s23, s15 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_mov_b32 s23, s3 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s14 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0 -; VI-NEXT: s_mov_b32 s11, s15 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -686,14 +686,14 @@ define amdgpu_kernel void @select_v2f16( ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: select_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[12:13], s[2:3], 0x44 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s2 @@ -754,7 +754,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_a( ; SI-LABEL: select_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; 
SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -801,7 +801,7 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; ; VI-LABEL: select_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -839,7 +839,7 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; ; GFX11-LABEL: select_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -895,7 +895,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_b( ; SI-LABEL: select_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -942,7 +942,7 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; ; VI-LABEL: select_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -980,7 +980,7 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; ; GFX11-LABEL: select_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1036,7 +1036,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_c( ; SI-LABEL: select_v2f16_imm_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1083,7 +1083,7 
@@ define amdgpu_kernel void @select_v2f16_imm_c( ; ; VI-LABEL: select_v2f16_imm_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s18, s10 @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; ; GFX11-LABEL: select_v2f16_imm_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s10 @@ -1179,7 +1179,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_d( ; SI-LABEL: select_v2f16_imm_d: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1226,7 +1226,7 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; ; VI-LABEL: select_v2f16_imm_d: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s18, s10 @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; ; GFX11-LABEL: select_v2f16_imm_d: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s10 diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll index cc82f532fc4779..c00cd763992d97 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc.ll @@ -463,4 +463,4 @@ entry: ret void } -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll index 31a802b7428b95..2169ee117cbaaf 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: sext_i16_to_i32_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -25,9 +25,9 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @sext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: sext_i16_to_i64_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -47,8 +47,8 @@ define amdgpu_kernel void @sext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @sext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: sext_i16_to_i32_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -68,8 +68,8 @@ 
define amdgpu_kernel void @sext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 define amdgpu_kernel void @sext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: sext_i16_to_i64_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -89,9 +89,9 @@ define amdgpu_kernel void @sext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 define amdgpu_kernel void @sext_i32_to_i64_uniform(ptr addrspace(1) %out, i32 %a, i64 %b) { ; GCN-LABEL: sext_i32_to_i64_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -111,8 +111,8 @@ define amdgpu_kernel void @sext_i32_to_i64_uniform(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @sext_i32_to_i64_divergent(ptr addrspace(1) %out, i32 %a, i64 %b) { ; GCN-LABEL: sext_i32_to_i64_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index 0630cca7c099b8..b67ecc2f9d13c8 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -11,18 +11,18 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(ptr addrspace(1) %out, i32 %a, 
i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s0, s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xf ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB0_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_add_i32 s7, s7, s0 +; SI-NEXT: s_add_i32 s7, s7, s2 ; SI-NEXT: s_cbranch_execnz .LBB0_3 ; SI-NEXT: .LBB0_2: ; %if ; SI-NEXT: s_sub_i32 s7, s5, s6 ; SI-NEXT: .LBB0_3: ; %endif -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_add_i32 s4, s7, s4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -56,23 +56,23 @@ endif: define amdgpu_kernel void @sgpr_if_else_salu_br_opt(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br_opt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s0, s[2:3], 0x2e -; SI-NEXT: s_load_dword s1, s[2:3], 0x37 +; SI-NEXT: s_load_dword s2, s[0:1], 0x2e +; SI-NEXT: s_load_dword s3, s[0:1], 0x37 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s5, s0, s1 +; SI-NEXT: s_add_i32 s5, s2, s3 ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: .LBB1_2: ; %if -; SI-NEXT: s_load_dword s0, s[2:3], 0x1c -; SI-NEXT: s_load_dword s1, s[2:3], 0x25 +; SI-NEXT: s_load_dword s2, s[0:1], 0x1c +; SI-NEXT: s_load_dword s3, s[0:1], 0x25 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s5, s0, s1 +; SI-NEXT: s_add_i32 s5, s2, s3 ; SI-NEXT: .LBB1_3: ; %endif -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_add_i32 s4, s5, s4 ; SI-NEXT: 
s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -108,28 +108,28 @@ endif: define amdgpu_kernel void @sgpr_if_else_valu_br(ptr addrspace(1) %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_valu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xc +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xc ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_cbranch_execz .LBB2_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s8, s6, s7 ; SI-NEXT: .LBB2_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; SI-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_xor_b64 exec, exec, s[0:1] +; SI-NEXT: s_xor_b64 exec, exec, s[2:3] ; SI-NEXT: s_cbranch_execz .LBB2_4 ; SI-NEXT: ; %bb.3: ; %if ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s4, s4, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB2_4: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -158,8 +158,8 @@ endif: define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: sgpr_if_else_valu_cmp_phi_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff --git 
a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll index 3d8807a88a46c1..8abd4b4302f547 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3 target triple = "amdgcn-amd-amdhsa" ; CHECK-LABEL: {{^}}t0: -; CHECK: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[6:7], 0x0 +; CHECK: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[4:5], 0x0 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]] ; There should be no redundant copies from PTR_HI. ; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll index 37cf76103aa945..21fcd3cd0dcd61 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -164,4 +164,4 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.x() #0 attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index 6de015c6de79b2..bdc607552a0dfb 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -391,7 +391,8 @@ define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(ptr add } declare i32 @llvm.amdgcn.workitem.id.x() #0 + declare i32 @llvm.amdgcn.workgroup.id.x() #0 attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index ebc916b5c889b5..4b02d00ddce1ef 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -181,7 +181,7 @@ define i128 @v_ashr_i128_kv(i128 %rhs) { define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_shl_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -214,7 +214,7 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_lshr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -247,7 +247,7 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_ashr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -430,7 +430,7 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 @@ 
-502,7 +502,7 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 @@ -574,7 +574,7 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 47ab5ba666877a..b3f4790df4d485 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workgroup.id.x() #0 define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -70,7 +70,7 @@ define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v4i32: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -92,7 +92,7 @@ define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -140,7 +140,7 @@ define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -159,7 +159,7 @@ define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: shl_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -214,40 +214,40 @@ define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @shl_i16_v_s(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %b) { ; SI-LABEL: shl_i16_v_s: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_ushort 
v0, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16_v_s: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s12, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s12, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_i16_v_s: @@ -287,42 +287,42 @@ define amdgpu_kernel void @shl_i16_v_s(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_i16_v_compute_s(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %b) { ; SI-LABEL: shl_i16_v_compute_s: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16_v_compute_s: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s12, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s12, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_i16_v_compute_s: @@ -370,7 +370,7 @@ define amdgpu_kernel void @shl_i16_v_compute_s(ptr addrspace(1) %out, ptr addrsp define 
amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_i16_computed_amount: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -396,7 +396,7 @@ define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: shl_i16_computed_amount: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -472,8 +472,8 @@ define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) { ; SI-LABEL: shl_i16_i_s: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -484,8 +484,8 @@ define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) { ; ; VI-LABEL: shl_i16_i_s: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -530,7 +530,7 @@ define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) { define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -561,7 
+561,7 @@ define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -630,7 +630,7 @@ define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -659,7 +659,7 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -752,7 +752,7 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -770,7 +770,7 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: shl_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -819,7 +819,7 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr 
addrspace(1) %in) { ; SI-LABEL: shl_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -839,7 +839,7 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -903,7 +903,7 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -929,7 +929,7 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; VI-NEXT: s_mov_b32 s19, 0xf000 @@ -1029,8 +1029,8 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: s_shl_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1041,8 +1041,8 @@ define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a ; ; VI-LABEL: s_shl_32_i64: ; VI: ; %bb.0: 
-; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1070,34 +1070,34 @@ define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a define amdgpu_kernel void @v_shl_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_shl_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_ashr_i32 s7, s6, 31 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_ashr_i32 s3, s2, 31 +; SI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_shl_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_ashr_i32 s7, s6, 31 -; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_ashr_i32 s3, s2, 31 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, s4 -; VI-NEXT: s_addc_u32 s3, s3, s5 +; VI-NEXT: s_add_u32 s2, s6, s0 +; VI-NEXT: s_addc_u32 s3, s7, s1 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_add_u32 s0, s0, s4 -; 
VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_add_u32 s0, s4, s0 +; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1137,7 +1137,7 @@ define amdgpu_kernel void @v_shl_32_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { ; SI-LABEL: s_shl_constant_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s9, 0xffff ; SI-NEXT: s_mov_b32 s8, s6 @@ -1153,7 +1153,7 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: s_shl_constant_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s9, 0xffff ; VI-NEXT: s_mov_b32 s8, s6 @@ -1195,7 +1195,7 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-LABEL: v_shl_constant_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1215,7 +1215,7 @@ define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: v_shl_constant_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-LABEL: v_shl_i64_32_bit_constant: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr ; ; VI-LABEL: v_shl_i64_32_bit_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -1331,7 +1331,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-LABEL: v_shl_inline_imm_64_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1349,7 +1349,7 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_shl_inline_imm_64_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -1394,8 +1394,8 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_64_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1407,8 +1407,8 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: s_shl_inline_imm_64_i64: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1444,8 +1444,8 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_1_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1457,8 +1457,8 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: s_shl_inline_imm_1_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1495,8 +1495,8 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_1_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1508,8 +1508,8 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_1_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 
+; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1542,8 +1542,8 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_1_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1555,8 +1555,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_1_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1589,8 +1589,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_0_5_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1602,8 +1602,8 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_0_5_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1636,8 +1636,8 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_0_5_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1649,8 +1649,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_0_5_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1683,8 +1683,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_2_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1696,8 +1696,8 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_2_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 
0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1730,8 +1730,8 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_2_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1743,8 +1743,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_2_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1777,8 +1777,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1790,8 +1790,8 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) @@ -1824,8 +1824,8 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1837,8 +1837,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1874,8 +1874,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_f32_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1887,8 +1887,8 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_f32_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1926,32 +1926,32 @@ define 
amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s4, -4.0 -; SI-NEXT: s_mov_b32 s5, -1 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s0, -4.0 +; SI-NEXT: s_mov_b32 s1, -1 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s6, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s4, -4.0 -; VI-NEXT: s_mov_b32 s5, -1 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s0, -4.0 +; VI-NEXT: s_mov_b32 s1, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: @@ -1982,32 +1982,32 @@ define 
amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_high_imm_f32_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s5, 4.0 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s1, 4.0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_high_imm_f32_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s6, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s4, 0 -; VI-NEXT: s_mov_b32 s5, 4.0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_mov_b32 s1, 4.0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64: @@ -2033,32 +2033,32 @@ define 
amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %o define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s5, -4.0 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s1, -4.0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s6, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s4, 0 -; VI-NEXT: s_mov_b32 s5, -4.0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_mov_b32 s1, -4.0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: @@ -2084,7 
+2084,7 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(ptr addrspace(1 define amdgpu_kernel void @test_mul2(i32 %p) { ; SI-LABEL: test_mul2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2096,7 +2096,7 @@ define amdgpu_kernel void @test_mul2(i32 %p) { ; ; VI-LABEL: test_mul2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 8c663d963b73e3..b81af3eb838f1f 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; VI-LABEL: s_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -40,7 +40,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; CI-LABEL: s_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -59,7 +59,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; GFX10-LABEL: 
s_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -71,7 +71,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; GFX11-LABEL: s_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -90,7 +90,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -101,7 +101,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -120,7 +120,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -142,7 +142,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -153,9 +153,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -180,20 +178,20 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: shl_v_s_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, s2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_s_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -214,21 +212,21 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: shl_v_s_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 
+; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s0, s8, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, s0, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, s0, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -237,10 +235,9 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: shl_v_s_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -250,12 +247,9 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: shl_v_s_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; 
GFX11-NEXT: s_waitcnt vmcnt(0) @@ -277,20 +271,20 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: shl_s_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s0 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_s_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -311,21 +305,21 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: shl_s_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s0, s8, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; 
CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshl_b32_e32 v2, s0, v2 -; CI-NEXT: v_lshl_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshl_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshl_b32_e32 v3, s0, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -334,10 +328,9 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: shl_s_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -347,12 +340,9 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: shl_s_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -374,7 +364,7 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -385,7 +375,7 @@ define amdgpu_kernel void 
@shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: shl_imm_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -405,7 +395,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -426,7 +416,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -437,9 +427,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_imm_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -462,7 +450,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -473,7 +461,7 @@ define amdgpu_kernel void 
@shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: shl_v_imm_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -493,7 +481,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -510,7 +498,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -521,9 +509,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_v_imm_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -546,7 +532,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -558,7 +544,7 @@ define amdgpu_kernel void 
@v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_shl_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -580,7 +566,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_shl_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -609,7 +595,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -621,9 +607,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_shl_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -649,7 +633,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -661,7 +645,7 @@ define amdgpu_kernel void 
@shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: shl_v_imm_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -685,7 +669,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -708,7 +692,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -720,9 +704,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_v_imm_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index ddf331816694ad..c5fc51091704b5 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -32,7 +32,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_x_sub_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -48,7 +48,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_x_sub_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -65,7 +65,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_x_sub_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -84,7 +84,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -95,7 +95,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: v_test_i32_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -106,9 +106,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_test_i32_x_sub_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -131,7 +129,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -153,7 +151,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -175,7 +173,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +196,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: ; VI-GISEL: ; %bb.0: -; 
VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -223,7 +221,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -240,7 +238,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -257,9 +255,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -291,7 +287,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_64_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -307,7 +303,7 @@ define 
amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_64_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -323,7 +319,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_64_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -340,7 +336,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_64_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -359,7 +355,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_64_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -370,7 +366,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: v_test_i32_64_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -381,9 +377,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; 
GFX11-LABEL: v_test_i32_64_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -406,7 +400,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_65: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -422,7 +416,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_x_sub_65: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -438,7 +432,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_x_sub_65: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -455,7 +449,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_x_sub_65: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ 
-474,7 +468,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -485,7 +479,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -496,7 +490,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -507,7 +501,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -518,9 +512,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -533,9 +525,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -558,7 +548,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_65_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -574,7 +564,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_65_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -590,7 +580,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_65_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -607,7 +597,7 @@ define amdgpu_kernel void 
@v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_65_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -626,7 +616,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_65_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -637,7 +627,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: v_test_i32_65_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -648,9 +638,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_test_i32_65_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -673,7 +661,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_neg16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 
s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -689,7 +677,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_x_sub_neg16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -705,7 +693,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_x_sub_neg16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -722,7 +710,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_x_sub_neg16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -741,7 +729,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -752,7 +740,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -763,7 +751,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -774,7 +762,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -785,9 +773,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -800,9 +786,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -825,7 +809,7 @@ define amdgpu_kernel void 
@v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_neg16_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -841,7 +825,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_neg16_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -857,7 +841,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_neg16_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -874,7 +858,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_neg16_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -893,7 +877,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_i32_neg16_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -904,7 +888,7 
@@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: v_test_i32_neg16_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -915,9 +899,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_test_i32_neg16_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -940,7 +922,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_neg17: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -956,7 +938,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_x_sub_neg17: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -972,7 +954,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_x_sub_neg17: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -989,7 +971,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_x_sub_neg17: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1008,7 +990,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -1019,7 +1001,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -1030,7 +1012,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -1041,7 +1023,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -1052,9 +1034,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1067,9 +1047,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1092,7 +1070,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_neg17_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1108,7 +1086,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_neg17_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1124,7 +1102,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_neg17_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1141,7 +1119,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_neg17_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1160,7 +1138,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_i32_neg17_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1171,7 +1149,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: v_test_i32_neg17_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -1182,9 +1160,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_test_i32_neg17_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1207,7 +1183,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; SI-LABEL: s_test_i32_x_sub_64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_i32 s0, s0, 64 ; SI-NEXT: ;;#ASMSTART @@ -1217,7 +1193,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; VI-LABEL: s_test_i32_x_sub_64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sub_i32 s0, s0, 64 ; VI-NEXT: ;;#ASMSTART @@ -1227,7 +1203,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; GFX9-LABEL: s_test_i32_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_i32 s0, s0, 64 ; GFX9-NEXT: ;;#ASMSTART @@ -1237,7 +1213,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; GFX10-LABEL: s_test_i32_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_i32 s0, s0, 64 ; GFX10-NEXT: ;;#ASMSTART @@ -1247,7 +1223,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; GFX11-LABEL: s_test_i32_x_sub_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s0, s0, 64 ; GFX11-NEXT: ;;#ASMSTART @@ -1262,7 +1238,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: 
v_test_i16_x_sub_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1278,7 +1254,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i16_x_sub_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1294,7 +1270,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i16_x_sub_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1311,7 +1287,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i16_x_sub_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1330,7 +1306,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i16_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1341,7 +1317,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: v_test_i16_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1352,9 +1328,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_test_i16_x_sub_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1377,7 +1351,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 @@ -1395,7 +1369,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1413,7 +1387,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1431,7 +1405,7 @@ define 
amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1451,7 +1425,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1463,7 +1437,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1476,9 +1450,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1505,7 +1477,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1527,7 +1499,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1549,7 +1521,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1572,7 +1544,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1597,7 +1569,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -1614,7 +1586,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1631,9 +1603,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1665,7 +1635,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_64_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1684,7 +1654,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_64_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1706,7 +1676,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1726,7 +1696,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; VI-GISEL-LABEL: 
v_test_v2i16_x_sub_64_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1748,7 +1718,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: v_test_v2i16_x_sub_64_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1759,7 +1729,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -1770,9 +1740,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_test_v2i16_x_sub_64_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1795,7 +1763,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_7_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 
2, v0 @@ -1814,7 +1782,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_7_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1836,7 +1804,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1856,7 +1824,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1878,7 +1846,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -1890,7 +1858,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x400007 ; GFX9-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) @@ -1902,7 +1870,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -1913,9 +1881,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_test_v2i16_x_sub_7_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1938,7 +1904,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_64_123: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1957,7 +1923,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_64_123: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1979,7 +1945,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_123: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff85 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1999,7 +1965,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_123: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7b ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2021,7 +1987,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -2033,7 +1999,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7b0040 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2045,7 +2011,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_123: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2056,9 +2022,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_sub_64_123: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 
0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2082,7 +2046,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_7_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2100,7 +2064,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_7_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2120,7 +2084,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2139,7 +2103,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2161,7 +2125,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr 
addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_v2i16_x_sub_7_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2172,7 +2136,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2183,9 +2147,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_test_v2i16_x_sub_7_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2209,7 +2171,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2225,7 +2187,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 
v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2246,7 +2208,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2265,7 +2227,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2286,7 +2248,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2297,7 +2259,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2308,9 +2270,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2333,7 +2293,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2349,7 +2309,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2370,7 +2330,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2389,7 +2349,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2410,7 +2370,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -2422,7 +2382,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 35 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2434,7 +2394,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2445,9 +2405,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2470,7 +2428,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2486,7 +2444,7 @@ define amdgpu_kernel void 
@v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2507,7 +2465,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2526,7 +2484,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2547,7 +2505,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -2559,7 +2517,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 34 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2571,7 
+2529,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2582,9 +2540,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2608,7 +2564,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2627,7 +2583,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2649,7 +2605,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 32 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -2669,7 +2625,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_not_b32_e32 v4, 31 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2691,7 +2647,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2702,7 +2658,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2713,9 +2669,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2738,7 +2692,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) 
%out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2754,7 +2708,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2775,7 +2729,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2794,7 +2748,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2815,7 +2769,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2826,7 +2780,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; 
GFX10-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2837,9 +2791,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2862,7 +2814,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2880,7 +2832,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2900,7 +2852,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2919,7 +2871,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2941,7 +2893,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2952,7 +2904,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2963,9 +2915,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2989,7 +2939,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3008,7 +2958,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3030,7 +2980,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, -16 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3050,7 +3000,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, -16 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3072,7 +3022,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3083,7 +3033,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -3094,9 +3044,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3119,7 +3067,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3135,7 +3083,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3156,7 +3104,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3175,7 +3123,7 @@ define amdgpu_kernel void 
@v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3196,7 +3144,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3207,7 +3155,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -3218,9 +3166,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3243,7 +3189,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3261,7 +3207,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3281,7 +3227,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3300,7 +3246,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3322,7 +3268,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3333,7 +3279,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -3344,9 +3290,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3369,7 +3313,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3388,7 +3332,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3410,7 +3354,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3430,7 +3374,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; VI-GISEL: 
; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3452,7 +3396,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -3464,7 +3408,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc400c400 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3476,7 +3420,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -3487,7 +3431,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -3498,9 +3442,7 @@ define 
amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3513,9 +3455,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3538,7 +3478,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3557,7 +3497,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3579,7 +3519,7 @@ define amdgpu_kernel void 
@v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3599,7 +3539,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3621,7 +3561,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -3633,7 +3573,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x44004400 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3645,7 +3585,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -3656,7 +3596,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -3667,9 +3607,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3682,9 +3620,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3707,7 +3643,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; 
SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3726,7 +3662,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3748,7 +3684,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3768,7 +3704,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3790,7 +3726,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3801,7 +3737,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
global_load_dword v1, v0, s[2:3] @@ -3812,9 +3748,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3837,7 +3771,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3856,7 +3790,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3878,7 +3812,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3898,7 +3832,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; VI-GISEL: ; %bb.0: -; 
VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3920,7 +3854,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3931,7 +3865,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -3942,9 +3876,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3967,7 +3899,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ 
-3984,7 +3916,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -4003,7 +3935,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4021,7 +3953,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4043,7 +3975,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -4054,7 +3986,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -4065,9 
+3997,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -4090,7 +4020,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4107,7 +4037,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -4124,7 +4054,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4141,7 +4071,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4163,7 +4093,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -4174,7 +4104,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_not_b32_e32 v2, 31 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -4186,7 +4116,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -4197,7 +4127,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -4208,9 +4138,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; 
GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -4223,9 +4151,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll index 9f3596359a6625..1ab63762ecbd72 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -5,22 +5,22 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: break_inserted_outside_of_loop: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, s0, v0 +; SI-NEXT: v_and_b32_e32 v0, s2, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: .LBB0_1: ; %ENDIF ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_and_b64 s[4:5], exec, vcc -; SI-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SI-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; SI-NEXT: 
s_andn2_b64 exec, exec, s[2:3] ; SI-NEXT: s_cbranch_execnz .LBB0_1 ; SI-NEXT: ; %bb.2: ; %ENDLOOP -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -30,22 +30,22 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, ; ; FLAT-LABEL: break_inserted_outside_of_loop: ; FLAT: ; %bb.0: ; %main_body -; FLAT-NEXT: s_load_dword s0, s[2:3], 0x2c +; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c ; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_and_b32_e32 v0, s0, v0 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v0 ; FLAT-NEXT: v_and_b32_e32 v0, 1, v0 ; FLAT-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; FLAT-NEXT: s_mov_b64 s[0:1], 0 +; FLAT-NEXT: s_mov_b64 s[2:3], 0 ; FLAT-NEXT: .LBB0_1: ; %ENDIF ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLAT-NEXT: s_and_b64 s[4:5], exec, vcc -; FLAT-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; FLAT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; FLAT-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; FLAT-NEXT: s_andn2_b64 exec, exec, s[2:3] ; FLAT-NEXT: s_cbranch_execnz .LBB0_1 ; FLAT-NEXT: ; %bb.2: ; %ENDLOOP -; FLAT-NEXT: s_or_b64 exec, exec, s[0:1] -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_or_b64 exec, exec, s[2:3] +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: v_mov_b32_e32 v0, 0 @@ -71,23 +71,23 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SI-NEXT: s_cbranch_execz .LBB1_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s2, 
s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s2, 0 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: s_and_b64 s[4:5], s[2:3], exec +; SI-NEXT: s_cmp_eq_u32 s0, 0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_and_b64 s[4:5], s[0:1], exec ; SI-NEXT: .LBB1_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: .LBB1_3: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_and_b64 s[2:3], exec, s[4:5] -; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SI-NEXT: s_and_b64 s[0:1], exec, s[4:5] +; SI-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: ; %bb.4: ; %exit ; SI-NEXT: s_endpgm @@ -96,23 +96,23 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { ; FLAT: ; %bb.0: ; %entry ; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; FLAT-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; FLAT-NEXT: s_mov_b64 s[0:1], 0 +; FLAT-NEXT: s_mov_b64 s[2:3], 0 ; FLAT-NEXT: s_mov_b64 s[4:5], 0 ; FLAT-NEXT: s_and_saveexec_b64 s[6:7], vcc ; FLAT-NEXT: s_cbranch_execz .LBB1_2 ; FLAT-NEXT: ; %bb.1: ; %else -; FLAT-NEXT: s_load_dword s2, s[2:3], 0x24 +; FLAT-NEXT: s_load_dword s0, s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_cmp_eq_u32 s2, 0 -; FLAT-NEXT: s_cselect_b64 s[2:3], -1, 0 -; FLAT-NEXT: s_and_b64 s[4:5], s[2:3], exec +; FLAT-NEXT: s_cmp_eq_u32 s0, 0 +; FLAT-NEXT: s_cselect_b64 s[0:1], -1, 0 +; FLAT-NEXT: s_and_b64 s[4:5], s[0:1], exec ; FLAT-NEXT: .LBB1_2: ; %endif ; FLAT-NEXT: s_or_b64 exec, exec, s[6:7] ; FLAT-NEXT: .LBB1_3: ; %loop ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLAT-NEXT: s_and_b64 s[2:3], exec, s[4:5] -; FLAT-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; FLAT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; FLAT-NEXT: s_and_b64 s[0:1], exec, s[4:5] +; FLAT-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; FLAT-NEXT: s_andn2_b64 exec, exec, s[2:3] ; FLAT-NEXT: 
s_cbranch_execnz .LBB1_3 ; FLAT-NEXT: ; %bb.4: ; %exit ; FLAT-NEXT: s_endpgm @@ -166,12 +166,12 @@ declare float @llvm.fabs.f32(float) nounwind readnone define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { ; SI-LABEL: loop_land_info_assert: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xa +; SI-NEXT: s_load_dword s2, s[0:1], 0xa ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lt_i32 s0, 4 +; SI-NEXT: s_cmp_lt_i32 s2, 4 ; SI-NEXT: s_cbranch_scc1 .LBB3_4 ; SI-NEXT: ; %bb.1: ; %for.cond.preheader -; SI-NEXT: s_load_dword s0, s[2:3], 0xc +; SI-NEXT: s_load_dword s0, s[0:1], 0xc ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmpk_lt_i32 s0, 0x3e8 ; SI-NEXT: s_cbranch_scc0 .LBB3_4 @@ -186,12 +186,12 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 ; ; FLAT-LABEL: loop_land_info_assert: ; FLAT: ; %bb.0: ; %entry -; FLAT-NEXT: s_load_dword s0, s[2:3], 0x28 +; FLAT-NEXT: s_load_dword s2, s[0:1], 0x28 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_cmp_lt_i32 s0, 4 +; FLAT-NEXT: s_cmp_lt_i32 s2, 4 ; FLAT-NEXT: s_cbranch_scc1 .LBB3_4 ; FLAT-NEXT: ; %bb.1: ; %for.cond.preheader -; FLAT-NEXT: s_load_dword s0, s[2:3], 0x30 +; FLAT-NEXT: s_load_dword s0, s[0:1], 0x30 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: s_cmpk_lt_i32 s0, 0x3e8 ; FLAT-NEXT: s_cbranch_scc0 .LBB3_4 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll index e64dcb74267dd9..7c5537747dd7b1 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @test(i32 %arg, i32 %arg1) { ; CHECK-LABEL: test: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_eq_u32 s0, 0 ; CHECK-NEXT: 
s_cselect_b64 s[2:3], -1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 1d183210f95380..f9a17783f0d352 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -43,14 +43,14 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-LABEL: kernel: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x10 -; CHECK-NEXT: s_load_dword s10, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x10 +; CHECK-NEXT: s_load_dword s10, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmpk_lg_i32 s0, 0x100 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 ; CHECK-NEXT: ; %bb.1: ; %if.else ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b64 s[6:7], 0 ; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc @@ -65,7 +65,7 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_and_b64 s[2:3], s[2:3], exec ; CHECK-NEXT: .LBB0_5: ; %Flow2 ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] -; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] ; CHECK-NEXT: s_cbranch_vccz .LBB0_8 ; CHECK-NEXT: s_branch .LBB0_7 ; CHECK-NEXT: .LBB0_6: @@ -77,15 +77,15 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[0:1], -1 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_13 ; CHECK-NEXT: .LBB0_8: ; %Flow4 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; CHECK-NEXT: .LBB0_9: ; %UnifiedUnreachableBlock ; CHECK-NEXT: ; divergent unreachable ; CHECK-NEXT: .LBB0_10: ; %Flow6 -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_and_saveexec_b64 
s[2:3], s[0:1] ; CHECK-NEXT: s_cbranch_execz .LBB0_12 ; CHECK-NEXT: ; %bb.11: ; %if.end6.sink.split -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: v_mov_b32_e32 v1, s10 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -96,14 +96,13 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], exec ; CHECK-NEXT: s_trap 2 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; CHECK-NEXT: s_cbranch_execnz .LBB0_9 ; CHECK-NEXT: s_branch .LBB0_10 ; CHECK-NEXT: .LBB0_14: ; %cond.false.i8 ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 - entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 5536a09538e6ee..2c0f64f85d823a 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -469,4 +469,4 @@ entry: } attributes #0 = { nounwind } -attributes #1 = { nounwind noinline "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index b54df3b4d0c6c6..9a03d216c7a99d 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i32: ; 
SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,7 +19,7 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,13 +39,15 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: test_s_sext_i32_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 -; SI-NEXT: s_add_i32 s4, s4, s6 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mul_i32 s4, s6, s7 +; SI-NEXT: s_add_i32 s4, s4, s8 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_ashr_i32 s5, s4, 31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -54,13 +56,15 @@ define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_s_sext_i32_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s4, s4, s5 -; VI-NEXT: s_add_i32 s4, s4, s6 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mul_i32 s4, s6, s7 +; VI-NEXT: s_add_i32 s4, s4, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; 
VI-NEXT: s_ashr_i32 s5, s4, 31 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -77,7 +81,7 @@ entry: define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,7 +96,7 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,8 +117,8 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -126,8 +130,8 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun ; ; VI-LABEL: s_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -144,7 +148,7 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -162,7 +166,7 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -186,8 +190,8 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) nounwind { ; SI-LABEL: s_sext_i16_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -199,8 +203,8 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun ; ; VI-LABEL: s_sext_i16_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -217,7 +221,7 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -231,7 +235,7 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -255,8 +259,8 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; SI-LABEL: s_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -271,8 +275,8 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; ; VI-LABEL: s_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -295,13 +299,15 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: v_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s5, s6 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_cmp_eq_u32 s7, s8 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; SI-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, s[4:5] @@ -310,13 +316,15 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; ; VI-LABEL: v_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s5, s6 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_cmp_eq_u32 s7, s8 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] @@ -342,8 +350,8 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -367,8 +375,8 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n ; ; VI-LABEL: s_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -407,7 +415,7 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -435,7 +443,7 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: v_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -479,7 +487,7 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) nounwind { ; SI-LABEL: s_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -505,7 +513,7 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) ; ; VI-LABEL: s_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -544,7 +552,7 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -572,7 +580,7 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; 
VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index e86ee1adef3d03..539cfc71a80f93 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -42,30 +42,25 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; ; GFX9-LABEL: test_simple_indirect_call: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x4 ; GFX9-NEXT: s_add_u32 s0, s0, s15 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s8, s8, 16 -; GFX9-NEXT: s_mul_i32 s8, s8, s9 -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v0 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, indirect@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, indirect@rel32@hi+12 -; GFX9-NEXT: v_mad_u32_u24 v3, v1, s9, v3 -; GFX9-NEXT: v_add_lshl_u32 v5, v3, v2, 3 -; GFX9-NEXT: v_mov_b32_e32 v3, s16 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, s17 -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-NEXT: ds_write_b64 v5, v[3:4] -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mul_lo_u32 v0, s4, v0 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, indirect@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, indirect@rel32@hi+12 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0 +; GFX9-NEXT: v_add_lshl_u32 v0, v0, v2, 3 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: ds_write_b64 v0, v[3:4] +; GFX9-NEXT: s_swappc_b64 s[30:31], 
s[6:7] ; GFX9-NEXT: s_endpgm %fptr = alloca ptr, addrspace(5) %fptr.cast = addrspacecast ptr addrspace(5) %fptr to ptr diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll index ba1caf376975c5..5a241f85b2e2c8 100644 --- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -837,5 +837,5 @@ entry: ; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]] ; GCN-PRELINK-DAG: attributes #[[$NOUNWIND]] = { nounwind } -; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) } attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index b8721129222043..d1f05358ff13af 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -7,8 +7,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: sint_to_fp_i32_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -18,8 +18,8 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: sint_to_fp_i32_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -36,8 +36,8 @@ 
define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: sint_to_fp_i1_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -50,8 +50,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: sint_to_fp_i1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -70,8 +70,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) { ; CI-LABEL: sint_to_fp_i1_f64_load: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitcmp1_b32 s2, 0 ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -84,8 +84,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) ; ; VI-LABEL: sint_to_fp_i1_f64_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) 
%out, i64 %in) { ; CI-LABEL: s_sint_to_fp_i64_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; CI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -116,7 +116,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; ; VI-LABEL: s_sint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -134,7 +134,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CI-LABEL: v_sint_to_fp_i64_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -153,7 +153,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_sint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -181,8 +181,8 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) { ; CI-LABEL: s_sint_to_fp_i8_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 @@ -193,8 +193,8 @@ define amdgpu_kernel void 
@s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; ; VI-LABEL: s_sint_to_fp_i8_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i32 s2, s2, 0x80000 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -230,8 +230,8 @@ define double @v_sint_to_fp_i8_to_f64(i8 %in) { define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: s_select_sint_to_fp_i1_vals_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -244,8 +244,8 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_sint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -281,8 +281,8 @@ define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: s_select_sint_to_fp_i1_vals_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -295,8 +295,8 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_sint_to_fp_i1_vals_i64: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -351,8 +351,8 @@ define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: s_swap_select_sint_to_fp_i1_vals_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -365,8 +365,8 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; ; VI-LABEL: s_swap_select_sint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll index 3b35b2d3d9865f..b03726817c1b48 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_sint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -32,7 +32,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_sint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; 
GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s5, s2, s3 ; GFX8-NEXT: s_flbit_i32 s4, s3 @@ -54,7 +54,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_sint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s4, s2, s3 @@ -87,7 +87,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -116,7 +116,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_sint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -146,15 +146,14 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_sint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, 
s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v3, v0, v1 -; GFX11-NEXT: v_cls_i32_e32 v4, v1 +; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2 +; GFX11-NEXT: v_cls_i32_e32 v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, -1, v4 @@ -162,17 +161,16 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: v_add_nc_u32_e32 v3, 32, v3 ; GFX11-NEXT: v_min_u32_e32 v3, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -188,7 +186,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_sint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -211,7 +209,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_sint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s5, s2, s3 ; GFX8-NEXT: s_flbit_i32 s4, s3 @@ -232,7 +230,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_sint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s4, s2, s3 @@ -263,7 +261,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -291,7 +289,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_sint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -320,16 +318,14 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_sint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v3, v0, v1 -; GFX11-NEXT: v_cls_i32_e32 v4, v1 +; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2 +; GFX11-NEXT: v_cls_i32_e32 v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, -1, v4 @@ -337,15 +333,15 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: v_add_nc_u32_e32 v3, 32, v3 ; GFX11-NEXT: v_min_u32_e32 v3, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -361,8 +357,8 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; 
GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -395,8 +391,8 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s3, s6, s7 ; GFX8-NEXT: s_flbit_i32 s2, s7 @@ -430,8 +426,8 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s3, s6, s7 @@ -471,7 +467,7 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -538,7 +534,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -607,24 +603,22 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16 -; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_xor_b32_e32 v9, v2, v3 -; GFX11-NEXT: v_xor_b32_e32 v11, v0, v1 +; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4 +; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v13, v6, v7 -; GFX11-NEXT: v_xor_b32_e32 v15, v4, v5 -; GFX11-NEXT: v_cls_i32_e32 v10, v3 -; GFX11-NEXT: v_cls_i32_e32 v12, v1 -; GFX11-NEXT: v_cls_i32_e32 v14, v7 -; GFX11-NEXT: v_cls_i32_e32 v16, v5 +; GFX11-NEXT: v_xor_b32_e32 v13, v7, v8 +; GFX11-NEXT: v_xor_b32_e32 v15, v5, v6 +; GFX11-NEXT: v_cls_i32_e32 v10, v4 +; GFX11-NEXT: v_cls_i32_e32 v12, v2 +; GFX11-NEXT: v_cls_i32_e32 v14, v8 +; GFX11-NEXT: v_cls_i32_e32 v16, v6 ; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11 ; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v13 @@ -644,33 +638,33 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; GFX11-NEXT: v_min_u32_e32 v11, v14, v13 ; GFX11-NEXT: v_min_u32_e32 v12, v16, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 
v[2:3], v9, v[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4] +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 -; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12 -; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 +; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v5, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v8 -; GFX11-NEXT: v_ldexp_f32 v3, v2, v9 -; GFX11-NEXT: v_ldexp_f32 v2, v0, v10 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 -; GFX11-NEXT: v_ldexp_f32 v0, v5, v4 -; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: v_cvt_f32_i32_e32 v6, v2 +; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 4, v0 +; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 +; GFX11-NEXT: v_ldexp_f32 v2, v1, v10 +; GFX11-NEXT: v_ldexp_f32 v1, v6, v11 +; GFX11-NEXT: 
v_ldexp_f32 v0, v4, v5 +; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -686,8 +680,8 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -724,8 +718,8 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s3, s6, s7 ; GFX8-NEXT: s_flbit_i32 s2, s7 @@ -762,8 +756,8 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s3, s6, s7 @@ -808,7 +802,7 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -883,7 +877,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -958,24 +952,22 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16 -; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_xor_b32_e32 v9, v2, v3 -; GFX11-NEXT: v_xor_b32_e32 v11, v0, v1 +; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4 +; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v13, v6, v7 -; GFX11-NEXT: v_xor_b32_e32 v15, v4, v5 -; GFX11-NEXT: v_cls_i32_e32 v10, v3 -; GFX11-NEXT: v_cls_i32_e32 v12, v1 -; GFX11-NEXT: v_cls_i32_e32 v14, v7 -; GFX11-NEXT: v_cls_i32_e32 v16, v5 +; GFX11-NEXT: v_xor_b32_e32 v13, v7, v8 +; GFX11-NEXT: v_xor_b32_e32 v15, v5, v6 +; GFX11-NEXT: v_cls_i32_e32 v10, v4 +; GFX11-NEXT: v_cls_i32_e32 v12, v2 +; GFX11-NEXT: v_cls_i32_e32 v14, v8 +; GFX11-NEXT: v_cls_i32_e32 v16, v6 ; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX11-NEXT: 
v_ashrrev_i32_e32 v11, 31, v11 ; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v13 @@ -995,41 +987,41 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; GFX11-NEXT: v_min_u32_e32 v11, v14, v13 ; GFX11-NEXT: v_min_u32_e32 v12, v16, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4] +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 -; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12 -; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v8 -; GFX11-NEXT: v_ldexp_f32 v2, v2, v9 -; GFX11-NEXT: v_ldexp_f32 v0, v0, v10 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 -; GFX11-NEXT: 
v_ldexp_f32 v3, v3, v4 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v1 +; GFX11-NEXT: v_ldexp_f32 v2, v2, v11 +; GFX11-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v1, v0, v2 -; GFX11-NEXT: v_pack_b32_f16 v0, v3, v4 +; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3 +; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index b08a35ab807324..b4b0d960e12e56 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; SI-LABEL: sitofp_i16_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; ; VI-LABEL: sitofp_i16_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ 
-43,7 +43,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; ; GFX11-LABEL: sitofp_i16_to_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @sitofp_i32_to_f16( ; SI-LABEL: sitofp_i32_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; ; VI-LABEL: sitofp_i32_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; ; GFX11-LABEL: sitofp_i32_to_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -143,7 +143,7 @@ entry: define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; SI-LABEL: sitofp_v2i16_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -168,7 +168,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; VI-LABEL: sitofp_v2i16_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -188,7 +188,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX11-LABEL: 
sitofp_v2i16_to_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -221,7 +221,7 @@ entry: define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; SI-LABEL: sitofp_v2i32_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -244,7 +244,7 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; ; VI-LABEL: sitofp_v2i32_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -266,7 +266,7 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; ; GFX11-LABEL: sitofp_v2i32_to_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -301,21 +301,19 @@ entry: define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_sint_to_fp_i1_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 
s15, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -323,26 +321,26 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1] ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sint_to_fp_i1_to_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -350,14 +348,16 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1] ; VI-NEXT: 
v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_sint_to_fp_i1_to_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll index fbb9ba0b73846e..233f4cc4fee501 100644 --- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll +++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll @@ -2,10 +2,10 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX940 %s -define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { +define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX940-LABEL: test: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -51,5 +51,3 @@ entry: ret void } declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32>, <4 x i32>, <4 x i32>, i32, i32 immarg, i32 immarg) - -attributes #0 = { "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 93e210bb4c8090..f8c9827ecf7a99 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -106,7 +106,7 @@ define amdgpu_kernel void @v_abs_v2i16_2(ptr addrspace(1) %out, ptr addrspace(1) } ; GCN-LABEL: {{^}}s_abs_v4i16: -; GFX9: 
s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x24 +; GFX9: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x24 ; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, s[[#LOAD + 2]] ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[#LOAD + 3]] ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[#LOAD + 2]], [[SUB0]] diff --git a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll index c54832d778434c..8b166b4c1bf3ff 100644 --- a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll @@ -333,7 +333,7 @@ endif: } ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16: -; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 0xffff8000 +; GCN: s_cmp_lt_u32 s2, 0xffff8000 define amdgpu_kernel void @br_scc_ult_i32_min_simm16(i32 %cond, ptr addrspace(1) %out) #0 { entry: %cmp0 = icmp ult i32 %cond, -32768 @@ -552,7 +552,7 @@ endif: } ; GCN-LABEL: {{^}}br_scc_ult_i32_non_u16: -; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 0xfffff7ff +; GCN: s_cmp_lt_u32 s2, 0xfffff7ff define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, ptr addrspace(1) %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index 804fb8f258ffd4..c9413b61758d14 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -121,7 +121,7 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float> declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) -attributes #1 = { nounwind "amdgpu-num-vgpr"="10" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #2 = { nounwind 
"amdgpu-num-vgpr"="12" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #3 = { nounwind "amdgpu-num-vgpr"="32" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #4 = { nounwind "amdgpu-num-vgpr"="6" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { nounwind "amdgpu-num-vgpr"="10" } +attributes #2 = { nounwind "amdgpu-num-vgpr"="12" } +attributes #3 = { nounwind "amdgpu-num-vgpr"="32" } +attributes #4 = { nounwind "amdgpu-num-vgpr"="6" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll index 55238b284efce5..baca66a287cbf2 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @test_inst_offset_kernel() { ; MUBUF-LABEL: test_inst_offset_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -24,8 +24,8 @@ define amdgpu_kernel void @test_inst_offset_kernel() { ; ; FLATSCR-LABEL: test_inst_offset_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; 
FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -61,7 +61,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_kernel() { ; MUBUF-LABEL: test_sgpr_offset_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -77,8 +77,8 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() { ; ; FLATSCR-LABEL: test_sgpr_offset_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -193,7 +193,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { ; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND @@ -215,8 +215,8 @@ define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { ; ; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s8, 0 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND @@ -275,7 +275,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { ; 
MUBUF-LABEL: test_sgpr_offset_subregs_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -298,8 +298,8 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { ; ; FLATSCR-LABEL: test_sgpr_offset_subregs_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -341,7 +341,7 @@ entry: define amdgpu_kernel void @test_inst_offset_subregs_kernel() { ; MUBUF-LABEL: test_inst_offset_subregs_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -365,8 +365,8 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() { ; ; FLATSCR-LABEL: test_inst_offset_subregs_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index b9ad4615fcbcf1..bea2e6d4b45a3c 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -16,7 +16,12 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, 
ptr addrspace(1) %in) { ; GFX6-LABEL: test: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_mov_b32 s42, -1 +; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX6-NEXT: s_add_u32 s40, s40, s3 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 13, v0 @@ -29,11 +34,6 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v0, vcc ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 -; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX6-NEXT: s_mov_b32 s42, -1 -; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX6-NEXT: s_add_u32 s40, s40, s9 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 ; GFX6-NEXT: s_mov_b32 s2, 0x3fd00 ; GFX6-NEXT: s_mov_b64 s[8:9], 0x100 @@ -4987,7 +4987,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; ; GFX9-FLATSCR-LABEL: test: ; GFX9-FLATSCR: ; %bb.0: ; %entry -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0 @@ -4999,8 +5001,6 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 -; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-FLATSCR-NEXT: s_addc_u32 
flat_scratch_hi, s7, 0 ; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill @@ -7613,11 +7613,11 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; ; GFX10-FLATSCR-LABEL: test: ; GFX10-FLATSCR: ; %bb.0: ; %entry -; GFX10-FLATSCR-NEXT: s_add_u32 s6, s6, s11 -; GFX10-FLATSCR-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLATSCR-NEXT: s_add_u32 s2, s2, s5 +; GFX10-FLATSCR-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0 @@ -10071,10 +10071,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 ; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s42, -1 ; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX6-NEXT: s_add_u32 s40, s40, s9 +; GFX6-NEXT: s_add_u32 s40, s40, s3 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 @@ -10646,14 +10646,14 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-FLATSCR-LABEL: test_limited_sgpr: ; GFX9-FLATSCR: ; %bb.0: ; %entry -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: 
v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 8, v0 -; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:240 -; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -10830,11 +10830,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; ; GFX10-FLATSCR-LABEL: test_limited_sgpr: ; GFX10-FLATSCR: ; %bb.0: ; %entry -; GFX10-FLATSCR-NEXT: s_add_u32 s6, s6, s11 -; GFX10-FLATSCR-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX10-FLATSCR-NEXT: s_add_u32 s2, s2, s5 +; GFX10-FLATSCR-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX10-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 ; GFX10-FLATSCR-NEXT: s_mov_b32 s33, exec_lo diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll index 5338bc8f7aa7ac..f5e94df415ae4f 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn 
-mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s -; The test was originally written to spill an SGPR to scratch without -; having spare SGPRs available to save exec. This scenario won't be -; true anymore as we reserve SGPR(s) upfront for saving exec. +; The test was originally written to spill an SGPR to scratch without having spare SGPRs +; available to save exec. This scenario won't be true anymore as we reserve SGPR(s) +; upfront for saving exec. define amdgpu_kernel void @test() #1 { ; GFX10-LABEL: test: @@ -12,7 +12,7 @@ define amdgpu_kernel void @test() #1 { ; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s10, -1 ; GFX10-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX10-NEXT: s_add_u32 s8, s8, s7 +; GFX10-NEXT: s_add_u32 s8, s8, s1 ; GFX10-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s[0:7] @@ -37,8 +37,5 @@ define amdgpu_kernel void @test() #1 { ret void } -; FIXME: amdgpu-no attributese are a workaround for cases where the -; number of incoming arguments is larger than the number of permitted -; registers.
attributes #0 = { nounwind } -attributes #1 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" "amdgpu-no-queue-ptr" "amdgpu-no-dispatch-id" } +attributes #1 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index b4a981f1db4ec7..d5f97314f9324c 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -5,17 +5,17 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-LABEL: name: test_spill_av_class ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr6_sgpr7 + ; GCN-NEXT: liveins: $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) + ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %30.sub0 - ; GCN-NEXT: SI_SPILL_V64_SAVE %30, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %24.sub0 + ; GCN-NEXT: SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store 
(s64) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %22:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %16:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]] ; GCN-NEXT: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll index bc13b8d0330177..c1c69ce568a9c4 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll @@ -44,7 +44,7 @@ define void @device_writelane_intrinsic(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @kernel_writelane_intrinsic(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GCN-LABEL: kernel_writelane_intrinsic: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v1, 45 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index cd06a060a50cd8..b8cf692372069a 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; 
SI-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -94,7 +94,7 @@ define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -147,7 +147,7 @@ define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -175,7 +175,7 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -243,7 +243,7 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) 
%in) { ; SI-LABEL: ashr_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -282,7 +282,7 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -395,8 +395,8 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_ashr_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -409,8 +409,8 @@ define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_ashr_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -443,7 +443,7 @@ entry: define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_i64_2: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -461,7 +461,7 @@ define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_i64_2: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -513,7 +513,7 @@ entry: define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -533,7 +533,7 @@ define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -597,7 +597,7 @@ define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -623,7 +623,7 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s10, s2 @@ -714,9 +714,9 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[2:3], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -730,9 +730,9 @@ define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_ashr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[2:3], 0x50 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x50 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -768,7 +768,7 @@ define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_ashr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -785,7 +785,7 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_ashr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -833,9 +833,9 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_63_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[2:3], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: 
s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -849,9 +849,9 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_ashr_63_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[2:3], 0x50 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x50 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -887,7 +887,7 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 % define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_ashr_63_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -905,7 +905,7 @@ define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_ashr_63_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index abf013e39eefa7..bcc67e974ae4a2 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i16_7: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[2:3] @@ -25,7 +25,7 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: 
srem_i16_7: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -49,7 +49,7 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i16_7: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -113,7 +113,7 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -149,7 +149,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TAHITI-LABEL: srem_i32: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -192,7 +192,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TONGA-LABEL: srem_i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -277,7 +277,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dword v1, v0, s[2:3] @@ -292,7 +292,7 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_i32_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -314,7 +314,7 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -363,7 +363,7 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32_7: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dword v1, v0, s[2:3] @@ -381,7 +381,7 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_i32_7: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -406,7 +406,7 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i32_7: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: 
v_mov_b32_e32 v1, s3 @@ -459,7 +459,7 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -521,7 +521,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v2i32: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -590,7 +590,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v2i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -723,7 +723,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -747,7 +747,7 @@ define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TAHITI-LABEL: srem_v2i32_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -778,7 +778,7 @@ define 
amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v2i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -842,7 +842,7 @@ define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] offset:16 @@ -958,7 +958,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v4i32: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -1081,7 +1081,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v4i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s4, s2, 16 ; TONGA-NEXT: s_addc_u32 s5, s3, 0 @@ -1317,7 +1317,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -1355,7 +1355,7 @@ define amdgpu_kernel void 
@srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TAHITI-LABEL: srem_v4i32_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -1400,7 +1400,7 @@ define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v4i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -1491,7 +1491,7 @@ define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] @@ -1675,7 +1675,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TAHITI-LABEL: srem_i64: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: v_mov_b32_e32 v4, 0 @@ -1836,7 +1836,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TONGA-LABEL: srem_i64: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: v_mov_b32_e32 v4, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s6 @@ -2589,7 +2589,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) 
{ ; GCN-LABEL: srem_i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -2606,7 +2606,7 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_i64_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -2630,7 +2630,7 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i64_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -2684,7 +2684,7 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16 @@ -3039,7 +3039,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v2i64: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: v_mov_b32_e32 v8, 0 @@ -3346,7 +3346,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v2i64: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: v_mov_b32_e32 v8, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 16 @@ -4733,7 +4733,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -4757,7 +4757,7 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TAHITI-LABEL: srem_v2i64_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -4788,7 +4788,7 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v2i64_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -4860,7 +4860,7 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[10:13], v8, s[6:7] offset:32 @@ -5486,7 +5486,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v4i64: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: v_mov_b32_e32 v8, 0 @@ -6088,7 +6088,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v4i64: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: v_mov_b32_e32 v8, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 48 @@ -8883,7 +8883,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] @@ -8924,7 +8924,7 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TAHITI-LABEL: srem_v4i64_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -8972,7 +8972,7 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v4i64_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 8498e9af46f2b5..93fab7dff253bc 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem: ; GCN: ; %bb.0: -; GCN-NEXT: 
s_load_dwordx2 s[12:13], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -122,8 +122,8 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GCN-IR-LABEL: s_test_srem: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -442,8 +442,8 @@ define i64 @v_test_srem(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -477,8 +477,8 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -519,8 +519,8 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s1, s[2:3], 
0xe +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -554,8 +554,8 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -650,14 +650,14 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 -; GCN-NEXT: s_abs_i32 s8, s0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 +; GCN-NEXT: s_abs_i32 s8, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -691,14 +691,14 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 -; GCN-IR-NEXT: s_abs_i32 s8, s0 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 +; GCN-IR-NEXT: s_abs_i32 s8, s2 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -739,14 +739,14 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 -; GCN-NEXT: s_abs_i32 s8, s0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 +; GCN-NEXT: s_abs_i32 s8, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -780,14 +780,14 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 -; GCN-IR-NEXT: s_abs_i32 s8, s0 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 +; GCN-IR-NEXT: s_abs_i32 s8, s2 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -829,18 +829,18 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword 
s0, s[2:3], 0xe +; GCN-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_abs_i32 s8, s0 +; GCN-NEXT: s_abs_i32 s8, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_sub_i32 s0, 0, s8 +; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_abs_i32 s2, s3 @@ -868,18 +868,18 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_abs_i32 s8, s0 +; GCN-IR-NEXT: s_abs_i32 s8, s2 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_abs_i32 s2, s3 @@ -915,8 +915,8 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem33_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1049,8 +1049,8 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem33_64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 @@ -1153,8 +1153,8 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_srem24_48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1189,22 +1189,22 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; ; GCN-IR-LABEL: s_test_srem24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 ; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[4:5], 24 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24 -; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[4:5], 16 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[0:1], 16 -; GCN-IR-NEXT: s_ashr_i32 s0, s1, 31 -; 
GCN-IR-NEXT: s_mov_b32 s1, s0 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[2:3], 16 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 +; GCN-IR-NEXT: s_mov_b32 s3, s2 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[6:7], 16 -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s4, s4, s0 -; GCN-IR-NEXT: s_subb_u32 s5, s5, s0 +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s4, s4, s2 +; GCN-IR-NEXT: s_subb_u32 s5, s5, s2 ; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11] @@ -1271,20 +1271,20 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: .LBB9_5: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GCN-IR-NEXT: s_mul_i32 s2, s6, s11 +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-IR-NEXT: s_mul_i32 s0, s6, s11 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s5 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: s_mul_i32 s2, s7, s10 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: s_mul_i32 s2, s6, s10 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: s_mul_i32 s0, s7, s10 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: s_mul_i32 s0, s6, s10 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 ; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; GCN-IR-NEXT: v_xor_b32_e32 v1, s0, v1 -; GCN-IR-NEXT: v_xor_b32_e32 v0, s1, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 -; GCN-IR-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s2, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, s3, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 ; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s14, -1 ; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v0, v2, vcc @@ -1302,7 +1302,7 @@ define 
amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_srem_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1411,7 +1411,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_srem_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s8, s3, 31 @@ -1984,7 +1984,7 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_srem24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -2016,7 +2016,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_srem24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -2054,7 +2054,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_srem24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 
-1 @@ -2085,7 +2085,7 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_srem24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index 03d1dddd7b6061..418c160d4244af 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: lshr_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -64,7 +64,7 @@ define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -83,7 +83,7 @@ define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: lshr_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -124,7 +124,7 @@ define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -146,7 +146,7 @@ define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: lshr_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -194,7 +194,7 @@ define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -212,7 +212,7 @@ define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: lshr_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -258,7 +258,7 @@ define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -284,7 +284,7 @@ define amdgpu_kernel 
void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: lshr_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; VI-NEXT: s_mov_b32 s19, 0xf000 @@ -370,8 +370,8 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: s_lshr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x14 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x14 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -382,8 +382,8 @@ define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_lshr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x50 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x50 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -411,7 +411,7 @@ define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % define amdgpu_kernel void @v_lshr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_lshr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -428,7 +428,7 @@ define amdgpu_kernel void @v_lshr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index 132775d81ca1ad..9ad9fa03048655 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -10,12 +10,12 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr addrspace(1) %input, ptr addrspace(1) %output, i32 %i) { ; MUBUF-LABEL: kernel_background_evaluate: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_load_dword s0, s[2:3], 0x24 +; MUBUF-NEXT: s_load_dword s0, s[0:1], 0x24 ; MUBUF-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; MUBUF-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; MUBUF-NEXT: s_mov_b32 s38, -1 ; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000 -; MUBUF-NEXT: s_add_u32 s36, s36, s9 +; MUBUF-NEXT: s_add_u32 s36, s36, s3 ; MUBUF-NEXT: s_addc_u32 s37, s37, 0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 @@ -48,12 +48,12 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; ; FLATSCR-LABEL: kernel_background_evaluate: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 s6, s6, s11 +; FLATSCR-NEXT: s_add_u32 s2, s2, s5 ; FLATSCR-NEXT: s_movk_i32 s32, 0x6000 -; FLATSCR-NEXT: s_addc_u32 s7, s7, 0 -; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; FLATSCR-NEXT: s_load_dword s2, s[2:3], 0x24 +; FLATSCR-NEXT: s_addc_u32 s3, s3, 0 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; FLATSCR-NEXT: s_load_dword s2, s[0:1], 0x24 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 0 @@ -81,7 +81,7 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; ; MUBUF11-LABEL: kernel_background_evaluate: ; 
MUBUF11: ; %bb.0: ; %entry -; MUBUF11-NEXT: s_load_b32 s2, s[2:3], 0x24 +; MUBUF11-NEXT: s_load_b32 s2, s[0:1], 0x24 ; MUBUF11-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF11-NEXT: v_dual_mov_b32 v2, 0x4000 :: v_dual_mov_b32 v3, 0 ; MUBUF11-NEXT: v_mov_b32_e32 v4, 0x400000 @@ -108,7 +108,7 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; ; FLATSCR11-LABEL: kernel_background_evaluate: ; FLATSCR11: ; %bb.0: ; %entry -; FLATSCR11-NEXT: s_load_b32 s2, s[2:3], 0x24 +; FLATSCR11-NEXT: s_load_b32 s2, s[0:1], 0x24 ; FLATSCR11-NEXT: v_mov_b32_e32 v1, 0x2000 ; FLATSCR11-NEXT: v_dual_mov_b32 v2, 0x4000 :: v_dual_mov_b32 v3, 0 ; FLATSCR11-NEXT: v_mov_b32_e32 v4, 0x400000 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll index 6ddf0986755f95..5c6f0019f1ed93 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; VI-LABEL: max_alignment_128: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s0, s17 +; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -20,23 +20,23 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; VI-NEXT: .amdhsa_kernel max_alignment_128 ; VI-NEXT: .amdhsa_group_segment_fixed_size 0 ; VI-NEXT: .amdhsa_private_segment_fixed_size 256 -; VI-NEXT: .amdhsa_kernarg_size 56 -; VI-NEXT: .amdhsa_user_sgpr_count 14 +; VI-NEXT: .amdhsa_kernarg_size 0 +; VI-NEXT: .amdhsa_user_sgpr_count 6 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; 
VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2 +; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; VI-NEXT: .amdhsa_next_free_vgpr 1 -; VI-NEXT: .amdhsa_next_free_sgpr 18 +; VI-NEXT: .amdhsa_next_free_sgpr 8 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_reserve_flat_scratch 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; ; GFX9-LABEL: max_alignment_128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, s17 +; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -71,23 +71,23 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; GFX9-NEXT: .amdhsa_kernel max_alignment_128 ; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0 ; GFX9-NEXT: .amdhsa_private_segment_fixed_size 256 -; GFX9-NEXT: .amdhsa_kernarg_size 56 -; GFX9-NEXT: .amdhsa_user_sgpr_count 14 +; GFX9-NEXT: .amdhsa_kernarg_size 0 +; GFX9-NEXT: .amdhsa_user_sgpr_count 6 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; 
GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2 +; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 -; GFX9-NEXT: .amdhsa_next_free_sgpr 18 +; GFX9-NEXT: .amdhsa_next_free_sgpr 8 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 ; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1 @@ -117,7 +117,7 @@ define amdgpu_kernel void @max_alignment_128() #0 { define amdgpu_kernel void @stackrealign_attr() #1 { ; VI-LABEL: stackrealign_attr: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s0, s17 +; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -131,23 +131,23 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; VI-NEXT: .amdhsa_kernel stackrealign_attr ; VI-NEXT: .amdhsa_group_segment_fixed_size 0 ; VI-NEXT: .amdhsa_private_segment_fixed_size 12 -; VI-NEXT: .amdhsa_kernarg_size 56 -; VI-NEXT: .amdhsa_user_sgpr_count 14 +; VI-NEXT: .amdhsa_kernarg_size 0 +; VI-NEXT: .amdhsa_user_sgpr_count 6 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; VI-NEXT: 
.amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2 +; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; VI-NEXT: .amdhsa_next_free_vgpr 1 -; VI-NEXT: .amdhsa_next_free_sgpr 18 +; VI-NEXT: .amdhsa_next_free_sgpr 8 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_reserve_flat_scratch 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 @@ -168,7 +168,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; ; GFX9-LABEL: stackrealign_attr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, s17 +; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -182,23 +182,23 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; GFX9-NEXT: .amdhsa_kernel stackrealign_attr ; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0 ; GFX9-NEXT: .amdhsa_private_segment_fixed_size 12 -; GFX9-NEXT: .amdhsa_kernarg_size 56 -; GFX9-NEXT: .amdhsa_user_sgpr_count 14 +; GFX9-NEXT: .amdhsa_kernarg_size 0 +; GFX9-NEXT: .amdhsa_user_sgpr_count 6 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GFX9-NEXT: 
.amdhsa_system_sgpr_workgroup_id_x 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2 +; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 -; GFX9-NEXT: .amdhsa_next_free_sgpr 18 +; GFX9-NEXT: .amdhsa_next_free_sgpr 8 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 ; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1 @@ -228,7 +228,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 { define amdgpu_kernel void @alignstack_attr() #2 { ; VI-LABEL: alignstack_attr: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s0, s17 +; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -242,23 +242,23 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; VI-NEXT: .amdhsa_kernel alignstack_attr ; VI-NEXT: .amdhsa_group_segment_fixed_size 0 ; VI-NEXT: .amdhsa_private_segment_fixed_size 128 -; VI-NEXT: .amdhsa_kernarg_size 56 -; VI-NEXT: .amdhsa_user_sgpr_count 14 +; VI-NEXT: .amdhsa_kernarg_size 0 +; VI-NEXT: .amdhsa_user_sgpr_count 6 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; VI-NEXT: 
.amdhsa_system_sgpr_workgroup_id_y 1 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2 +; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; VI-NEXT: .amdhsa_next_free_vgpr 1 -; VI-NEXT: .amdhsa_next_free_sgpr 18 +; VI-NEXT: .amdhsa_next_free_sgpr 8 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_reserve_flat_scratch 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 @@ -279,7 +279,7 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; ; GFX9-LABEL: alignstack_attr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, s17 +; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -293,23 +293,23 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; GFX9-NEXT: .amdhsa_kernel alignstack_attr ; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0 ; GFX9-NEXT: .amdhsa_private_segment_fixed_size 128 -; GFX9-NEXT: .amdhsa_kernarg_size 56 -; GFX9-NEXT: .amdhsa_user_sgpr_count 14 +; GFX9-NEXT: .amdhsa_kernarg_size 0 +; GFX9-NEXT: .amdhsa_user_sgpr_count 6 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +; 
GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2 +; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 -; GFX9-NEXT: .amdhsa_next_free_sgpr 18 +; GFX9-NEXT: .amdhsa_next_free_sgpr 8 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 ; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1 diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index 3c16cd29de8f6a..c6a599094fe431 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -121,31 +121,31 @@ define amdgpu_kernel void @kernel_store_stacksave() { define amdgpu_kernel void @kernel_store_stacksave_nocall() { ; WAVE32-OPT-LABEL: kernel_store_stacksave_nocall: ; WAVE32-OPT: ; %bb.0: -; WAVE32-OPT-NEXT: s_getpc_b64 s[12:13] -; WAVE32-OPT-NEXT: s_mov_b32 s12, s0 +; WAVE32-OPT-NEXT: s_getpc_b64 s[4:5] +; WAVE32-OPT-NEXT: s_mov_b32 s4, s0 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v0, 0 -; WAVE32-OPT-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; WAVE32-OPT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; WAVE32-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-OPT-NEXT: s_bitset0_b32 s15, 21 -; WAVE32-OPT-NEXT: s_add_u32 s12, s12, s9 -; WAVE32-OPT-NEXT: s_addc_u32 s13, s13, 0 +; WAVE32-OPT-NEXT: s_bitset0_b32 s7, 21 +; WAVE32-OPT-NEXT: s_add_u32 s4, s4, s1 +; WAVE32-OPT-NEXT: s_addc_u32 s5, s5, 0 ; WAVE32-OPT-NEXT: s_lshr_b32 s0, s32, 5 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v1, s0 -; WAVE32-OPT-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; WAVE32-OPT-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; WAVE32-OPT-NEXT: s_endpgm ; ; WAVE64-OPT-LABEL: kernel_store_stacksave_nocall: ; WAVE64-OPT: ; %bb.0: -; WAVE64-OPT-NEXT: s_getpc_b64 s[12:13] -; WAVE64-OPT-NEXT: s_mov_b32 s12, s0 +; WAVE64-OPT-NEXT: s_getpc_b64 s[4:5] 
+; WAVE64-OPT-NEXT: s_mov_b32 s4, s0 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v0, 0 -; WAVE64-OPT-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; WAVE64-OPT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; WAVE64-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE64-OPT-NEXT: s_add_u32 s12, s12, s9 -; WAVE64-OPT-NEXT: s_addc_u32 s13, s13, 0 +; WAVE64-OPT-NEXT: s_add_u32 s4, s4, s1 +; WAVE64-OPT-NEXT: s_addc_u32 s5, s5, 0 ; WAVE64-OPT-NEXT: s_lshr_b32 s0, s32, 6 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v1, s0 -; WAVE64-OPT-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; WAVE64-OPT-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; WAVE64-OPT-NEXT: s_endpgm ; ; WAVE32-O0-LABEL: kernel_store_stacksave_nocall: @@ -803,7 +803,7 @@ define amdgpu_gfx void @func_stacksave_sgpr(ptr addrspace(5) inreg %stack) { define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; WAVE32-OPT-LABEL: kernel_stacksave_sgpr: ; WAVE32-OPT: ; %bb.0: -; WAVE32-OPT-NEXT: s_load_dword s0, s[2:3], 0x0 +; WAVE32-OPT-NEXT: s_load_dword s0, s[0:1], 0x0 ; WAVE32-OPT-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-OPT-NEXT: ;;#ASMSTART ; WAVE32-OPT-NEXT: ; use s0 @@ -812,7 +812,7 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; ; WAVE64-OPT-LABEL: kernel_stacksave_sgpr: ; WAVE64-OPT: ; %bb.0: -; WAVE64-OPT-NEXT: s_load_dword s0, s[2:3], 0x0 +; WAVE64-OPT-NEXT: s_load_dword s0, s[0:1], 0x0 ; WAVE64-OPT-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-OPT-NEXT: ;;#ASMSTART ; WAVE64-OPT-NEXT: ; use s0 @@ -862,72 +862,54 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-OPT-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-OPT: ; %bb.0: -; WAVE32-OPT-NEXT: s_getpc_b64 s[20:21] -; WAVE32-OPT-NEXT: s_mov_b32 s20, s0 -; WAVE32-OPT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; WAVE32-OPT-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 -; WAVE32-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, 
v1 +; WAVE32-OPT-NEXT: s_getpc_b64 s[8:9] +; WAVE32-OPT-NEXT: s_mov_b32 s8, s0 ; WAVE32-OPT-NEXT: s_movk_i32 s32, 0x1200 -; WAVE32-OPT-NEXT: s_mov_b64 s[10:11], s[4:5] -; WAVE32-OPT-NEXT: s_mov_b32 s4, s32 -; WAVE32-OPT-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-OPT-NEXT: v_mov_b32_e32 v4, 17 -; WAVE32-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 -; WAVE32-OPT-NEXT: s_mov_b32 s14, s8 -; WAVE32-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi -; WAVE32-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo -; WAVE32-OPT-NEXT: s_mov_b32 s12, s6 -; WAVE32-OPT-NEXT: s_mov_b32 s13, s7 +; WAVE32-OPT-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; WAVE32-OPT-NEXT: s_mov_b32 s0, s32 +; WAVE32-OPT-NEXT: v_mov_b32_e32 v0, 42 +; WAVE32-OPT-NEXT: v_mov_b32_e32 v1, 17 +; WAVE32-OPT-NEXT: s_mov_b32 s5, stack_passed_argument@abs32@hi +; WAVE32-OPT-NEXT: s_mov_b32 s4, stack_passed_argument@abs32@lo ; WAVE32-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-OPT-NEXT: s_bitset0_b32 s23, 21 -; WAVE32-OPT-NEXT: s_add_u32 s20, s20, s9 -; WAVE32-OPT-NEXT: s_addc_u32 s21, s21, 0 -; WAVE32-OPT-NEXT: s_lshr_b32 s15, s4, 5 -; WAVE32-OPT-NEXT: s_mov_b64 s[4:5], s[0:1] -; WAVE32-OPT-NEXT: s_mov_b64 s[8:9], s[2:3] -; WAVE32-OPT-NEXT: s_mov_b64 s[0:1], s[20:21] -; WAVE32-OPT-NEXT: s_mov_b64 s[2:3], s[22:23] -; WAVE32-OPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 +; WAVE32-OPT-NEXT: s_bitset0_b32 s11, 21 +; WAVE32-OPT-NEXT: s_add_u32 s8, s8, s1 +; WAVE32-OPT-NEXT: s_addc_u32 s9, s9, 0 +; WAVE32-OPT-NEXT: s_lshr_b32 s6, s0, 5 +; WAVE32-OPT-NEXT: s_mov_b64 s[0:1], s[8:9] +; WAVE32-OPT-NEXT: s_mov_b64 s[2:3], s[10:11] +; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; WAVE32-OPT-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE32-OPT-NEXT: buffer_store_dword v4, off, s[20:23], s32 offset:4 -; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4 +; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] ; WAVE32-OPT-NEXT: ;;#ASMSTART -; WAVE32-OPT-NEXT: ; 
use s15 +; WAVE32-OPT-NEXT: ; use s6 ; WAVE32-OPT-NEXT: ;;#ASMEND ; WAVE32-OPT-NEXT: s_endpgm ; ; WAVE64-OPT-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: ; WAVE64-OPT: ; %bb.0: -; WAVE64-OPT-NEXT: s_getpc_b64 s[20:21] -; WAVE64-OPT-NEXT: s_mov_b32 s20, s0 -; WAVE64-OPT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; WAVE64-OPT-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 -; WAVE64-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; WAVE64-OPT-NEXT: s_getpc_b64 s[8:9] +; WAVE64-OPT-NEXT: s_mov_b32 s8, s0 ; WAVE64-OPT-NEXT: s_movk_i32 s32, 0x2400 -; WAVE64-OPT-NEXT: s_mov_b64 s[10:11], s[4:5] -; WAVE64-OPT-NEXT: s_mov_b32 s4, s32 -; WAVE64-OPT-NEXT: v_mov_b32_e32 v3, 42 -; WAVE64-OPT-NEXT: v_mov_b32_e32 v4, 17 -; WAVE64-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 -; WAVE64-OPT-NEXT: s_mov_b32 s14, s8 -; WAVE64-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi -; WAVE64-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo -; WAVE64-OPT-NEXT: s_mov_b32 s12, s6 -; WAVE64-OPT-NEXT: s_mov_b32 s13, s7 +; WAVE64-OPT-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; WAVE64-OPT-NEXT: s_mov_b32 s0, s32 +; WAVE64-OPT-NEXT: v_mov_b32_e32 v0, 42 +; WAVE64-OPT-NEXT: v_mov_b32_e32 v1, 17 +; WAVE64-OPT-NEXT: s_mov_b32 s5, stack_passed_argument@abs32@hi +; WAVE64-OPT-NEXT: s_mov_b32 s4, stack_passed_argument@abs32@lo ; WAVE64-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE64-OPT-NEXT: s_add_u32 s20, s20, s9 -; WAVE64-OPT-NEXT: s_addc_u32 s21, s21, 0 -; WAVE64-OPT-NEXT: s_lshr_b32 s15, s4, 6 -; WAVE64-OPT-NEXT: s_mov_b64 s[4:5], s[0:1] -; WAVE64-OPT-NEXT: s_mov_b64 s[8:9], s[2:3] -; WAVE64-OPT-NEXT: s_mov_b64 s[0:1], s[20:21] -; WAVE64-OPT-NEXT: s_mov_b64 s[2:3], s[22:23] -; WAVE64-OPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 +; WAVE64-OPT-NEXT: s_add_u32 s8, s8, s1 +; WAVE64-OPT-NEXT: s_addc_u32 s9, s9, 0 +; WAVE64-OPT-NEXT: s_lshr_b32 s6, s0, 6 +; WAVE64-OPT-NEXT: s_mov_b64 s[0:1], s[8:9] +; WAVE64-OPT-NEXT: s_mov_b64 s[2:3], s[10:11] +; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 
; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE64-OPT-NEXT: buffer_store_dword v4, off, s[20:23], s32 offset:4 -; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4 +; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] ; WAVE64-OPT-NEXT: ;;#ASMSTART -; WAVE64-OPT-NEXT: ; use s15 +; WAVE64-OPT-NEXT: ; use s6 ; WAVE64-OPT-NEXT: ;;#ASMEND ; WAVE64-OPT-NEXT: s_endpgm ; @@ -1292,70 +1274,70 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-OPT-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-OPT: ; %bb.0: ; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-OPT-NEXT: s_mov_b32 s20, s33 +; WAVE32-OPT-NEXT: s_mov_b32 s8, s33 ; WAVE32-OPT-NEXT: s_mov_b32 s33, s32 -; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s16, -1 -; WAVE32-OPT-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; WAVE32-OPT-NEXT: s_mov_b32 exec_lo, s16 -; WAVE32-OPT-NEXT: v_writelane_b32 v32, s30, 0 +; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s4, -1 +; WAVE32-OPT-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE32-OPT-NEXT: s_mov_b32 exec_lo, s4 +; WAVE32-OPT-NEXT: v_writelane_b32 v31, s30, 0 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v0, 42 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v1, 17 ; WAVE32-OPT-NEXT: s_addk_i32 s32, 0x1200 -; WAVE32-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi -; WAVE32-OPT-NEXT: s_mov_b32 s18, s32 -; WAVE32-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo -; WAVE32-OPT-NEXT: v_writelane_b32 v32, s31, 1 -; WAVE32-OPT-NEXT: s_lshr_b32 s19, s18, 5 +; WAVE32-OPT-NEXT: s_mov_b32 s5, stack_passed_argument@abs32@hi +; WAVE32-OPT-NEXT: s_mov_b32 s6, s32 +; WAVE32-OPT-NEXT: s_mov_b32 s4, stack_passed_argument@abs32@lo +; WAVE32-OPT-NEXT: v_writelane_b32 v31, s31, 1 +; WAVE32-OPT-NEXT: s_lshr_b32 s7, s6, 5 ; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; WAVE32-OPT-NEXT: s_waitcnt_vscnt 
null, 0x0 ; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] ; WAVE32-OPT-NEXT: ;;#ASMSTART -; WAVE32-OPT-NEXT: ; use s19 +; WAVE32-OPT-NEXT: ; use s7 ; WAVE32-OPT-NEXT: ;;#ASMEND -; WAVE32-OPT-NEXT: s_mov_b32 s32, s18 -; WAVE32-OPT-NEXT: v_readlane_b32 s31, v32, 1 -; WAVE32-OPT-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE32-OPT-NEXT: s_mov_b32 s32, s6 +; WAVE32-OPT-NEXT: v_readlane_b32 s31, v31, 1 +; WAVE32-OPT-NEXT: v_readlane_b32 s30, v31, 0 ; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s4, -1 -; WAVE32-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; WAVE32-OPT-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-OPT-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-OPT-NEXT: s_addk_i32 s32, 0xee00 -; WAVE32-OPT-NEXT: s_mov_b32 s33, s20 +; WAVE32-OPT-NEXT: s_mov_b32 s33, s8 ; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) ; WAVE32-OPT-NEXT: s_setpc_b64 s[30:31] ; ; WAVE64-OPT-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE64-OPT: ; %bb.0: ; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE64-OPT-NEXT: s_mov_b32 s20, s33 +; WAVE64-OPT-NEXT: s_mov_b32 s8, s33 ; WAVE64-OPT-NEXT: s_mov_b32 s33, s32 -; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; WAVE64-OPT-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; WAVE64-OPT-NEXT: s_mov_b64 exec, s[16:17] -; WAVE64-OPT-NEXT: v_writelane_b32 v32, s30, 0 +; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; WAVE64-OPT-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE64-OPT-NEXT: s_mov_b64 exec, s[4:5] +; WAVE64-OPT-NEXT: v_writelane_b32 v31, s30, 0 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v0, 42 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v1, 17 ; WAVE64-OPT-NEXT: s_addk_i32 s32, 0x2400 -; WAVE64-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi -; WAVE64-OPT-NEXT: s_mov_b32 
s18, s32 -; WAVE64-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo -; WAVE64-OPT-NEXT: v_writelane_b32 v32, s31, 1 -; WAVE64-OPT-NEXT: s_lshr_b32 s19, s18, 6 +; WAVE64-OPT-NEXT: s_mov_b32 s5, stack_passed_argument@abs32@hi +; WAVE64-OPT-NEXT: s_mov_b32 s6, s32 +; WAVE64-OPT-NEXT: s_mov_b32 s4, stack_passed_argument@abs32@lo +; WAVE64-OPT-NEXT: v_writelane_b32 v31, s31, 1 +; WAVE64-OPT-NEXT: s_lshr_b32 s7, s6, 6 ; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] ; WAVE64-OPT-NEXT: ;;#ASMSTART -; WAVE64-OPT-NEXT: ; use s19 +; WAVE64-OPT-NEXT: ; use s7 ; WAVE64-OPT-NEXT: ;;#ASMEND -; WAVE64-OPT-NEXT: s_mov_b32 s32, s18 -; WAVE64-OPT-NEXT: v_readlane_b32 s31, v32, 1 -; WAVE64-OPT-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE64-OPT-NEXT: s_mov_b32 s32, s6 +; WAVE64-OPT-NEXT: v_readlane_b32 s31, v31, 1 +; WAVE64-OPT-NEXT: v_readlane_b32 s30, v31, 0 ; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; WAVE64-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; WAVE64-OPT-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE64-OPT-NEXT: s_mov_b64 exec, s[4:5] ; WAVE64-OPT-NEXT: s_addk_i32 s32, 0xdc00 -; WAVE64-OPT-NEXT: s_mov_b32 s33, s20 +; WAVE64-OPT-NEXT: s_mov_b32 s33, s8 ; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) ; WAVE64-OPT-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll index f7eb760fda084f..01ad9665971394 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -8,10 +8,10 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 @@ -21,8 +21,8 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; ; GFX7-LABEL: store_lds_v4i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, s4 @@ -35,8 +35,8 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; ; GFX6-LABEL: store_lds_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 @@ -50,10 +50,10 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; GFX10-LABEL: store_lds_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 @@ -64,8 +64,8 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; GFX11-LABEL: store_lds_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 
0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 @@ -79,10 +79,10 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:12 @@ -123,8 +123,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -176,8 +176,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -230,10 +230,10 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: s_lshr_b32 s3, s6, 24 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 @@ -275,8 +275,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_lshr_b32 s4, s3, 8 @@ -317,10 +317,10 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:12 @@ -337,8 +337,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -366,8 +366,8 @@ define 
amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -396,10 +396,10 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 @@ -417,8 +417,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -439,10 +439,10 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 @@ -453,8 +453,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -468,8 +468,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -484,10 +484,10 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 @@ -499,8 +499,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 
0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 @@ -515,10 +515,10 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -528,8 +528,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, s4 @@ -542,8 +542,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -557,12 +557,12 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 @@ -571,8 +571,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -587,10 +587,10 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 @@ -600,8 +600,8 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; ; GFX7-LABEL: store_lds_v4i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, s4 @@ -614,8 +614,8 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr 
addrspace(3) %out, <4 x i ; ; GFX6-LABEL: store_lds_v4i32_align16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 @@ -629,10 +629,10 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; GFX10-LABEL: store_lds_v4i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 @@ -643,8 +643,8 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; GFX11-LABEL: store_lds_v4i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll index 64ce67a1a3deeb..507b411996d973 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -8,20 +8,20 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; 
GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -33,8 +33,8 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) ; ; GFX6-LABEL: store_lds_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -48,21 +48,21 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) ; GFX10-LABEL: store_lds_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 @@ -75,10 +75,10 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 @@ -110,8 +110,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -152,8 +152,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -195,10 +195,10 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 @@ -231,8 +231,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0 @@ -265,10 +265,10 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 @@ -282,8 +282,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -306,8 +306,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align2: ; 
GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -331,10 +331,10 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 @@ -349,8 +349,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -368,10 +368,10 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: 
v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 @@ -381,8 +381,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -395,8 +395,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -410,10 +410,10 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 @@ -424,8 +424,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: 
v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 @@ -439,10 +439,10 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -452,8 +452,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 @@ -466,8 +466,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -481,10 +481,10 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -495,8 +495,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -510,20 +510,20 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -535,8 +535,8 @@ define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i ; ; GFX6-LABEL: store_lds_v3i32_align16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: 
s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -550,21 +550,21 @@ define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i ; GFX10-LABEL: store_lds_v3i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 3644bef9c20a1f..f88aaf389ca9ae 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -50,12 +50,12 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 { define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_or_b32 s0, s6, 14 +; HAWAII-NEXT: s_or_b32 s0, s4, 14 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s7 +; HAWAII-NEXT: v_mov_b32_e32 v1, s5 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x0 -; 
HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v1, s2 @@ -70,12 +70,12 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_or_b32 s0, s6, 14 +; FIJI-NEXT: s_or_b32 s0, s4, 14 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s7 +; FIJI-NEXT: v_mov_b32_e32 v1, s5 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; FIJI-NEXT: s_load_dword s2, s[6:7], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_and_b32 s3, s1, 0xffff @@ -94,9 +94,9 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[6:7] offset:14 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s1, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -114,9 +114,9 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[6:7] offset:14 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s3, s1, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v1, s2 @@ -133,16 +133,16 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX11-LABEL: local_store_i55: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[2:3] offset:14 +; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[0:1] offset:14 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s3, s1, 0xffff -; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_mov_b32_e32 v3, s0 +; GFX11-NEXT: s_and_b32 s1, s3, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s3 +; GFX11-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX11-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 ; GFX11-NEXT: ds_store_b8_d16_hi v1, v0 offset:6 @@ -156,8 +156,8 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v0, s2 @@ -169,8 +169,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[6:7], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 
+; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, s2 @@ -182,8 +182,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -195,8 +195,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; GFX10-LABEL: local_store_i48: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -208,10 +208,10 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; GFX11-LABEL: local_store_i48: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: ds_store_b16 v0, v1 offset:4 ; GFX11-NEXT: ds_store_b32 v0, v2 @@ -223,9 +223,9 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x4 -; HAWAII-NEXT: s_load_dword s3, 
s[6:7], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x4 +; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: s_and_b32 s2, s2, 1 @@ -239,9 +239,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[6:7], 0x10 -; FIJI-NEXT: s_load_dword s3, s[6:7], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x10 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_and_b32 s2, s2, 1 @@ -255,9 +255,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x10 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 @@ -271,9 +271,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; GFX10-LABEL: local_store_i65: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x10 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 @@ -287,13 +287,13 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, 
i65 %arg) #0 { ; GFX11-LABEL: local_store_i65: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 1 +; GFX11-NEXT: s_and_b32 s2, s2, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: ds_store_b8 v2, v3 offset:8 ; GFX11-NEXT: ds_store_b64 v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 60448735632548..ded308ae4f2307 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: s_sub_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22,7 +22,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX8-LABEL: s_sub_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s2, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -33,7 +33,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX9-LABEL: s_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_i32 s2, s2, s3 @@ -43,7 +43,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX12-LABEL: s_sub_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, s2, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -60,8 +60,8 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: s_sub_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -72,10 +72,10 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; ; GFX8-LABEL: s_sub_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s4 +; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 @@ -84,18 +84,18 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; ; GFX9-LABEL: s_sub_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, 0x4d2, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: 
s_sub_i32 s0, 0x4d2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, 0x4d2, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -112,7 +112,7 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -130,7 +130,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: test_sub_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -144,7 +144,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -155,7 +155,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: test_sub_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -176,7 +176,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) define 
amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -194,7 +194,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: test_sub_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -208,7 +208,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: test_sub_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -219,7 +219,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: test_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -238,7 +238,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -257,7 +257,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -272,7 +272,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -284,7 +284,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -306,7 +306,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -328,7 +328,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -350,7 +350,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 @@ -365,7 +365,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 @@ -391,7 +391,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -412,7 +412,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: test_sub_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -432,7 +432,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -446,11 +446,9 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: test_sub_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; 
GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -474,7 +472,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -499,7 +497,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -517,7 +515,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -529,11 +527,9 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -555,7 +551,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void 
@test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -587,7 +583,7 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -608,7 +604,7 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -621,11 +617,9 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -648,8 +642,8 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind { ; GFX6-LABEL: s_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -662,8 +656,8 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; ; GFX8-LABEL: s_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_u32 s2, s4, s6 ; GFX8-NEXT: s_subb_u32 s3, s5, s7 @@ -676,22 +670,22 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; ; GFX9-LABEL: s_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s2, s4, s6 -; GFX9-NEXT: s_subb_u32 s3, s5, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_sub_u32 s0, s4, s6 +; GFX9-NEXT: s_subb_u32 s1, s5, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[4:5], s[6:7] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -708,8 +702,8 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) 
nounwind { ; GFX6-LABEL: v_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -731,8 +725,8 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX8-LABEL: v_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -753,12 +747,12 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX9-LABEL: v_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 @@ -769,12 +763,10 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; GFX12-LABEL: v_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: 
v_lshlrev_b32_e32 v2, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7] @@ -799,8 +791,8 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { ; GFX6-LABEL: v_test_sub_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -824,8 +816,8 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: v_test_sub_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -848,12 +840,12 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_sub_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 @@ -866,12 +858,10 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr 
addrspace ; GFX12-LABEL: v_test_sub_v2i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v8, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7] @@ -898,8 +888,8 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { ; GFX6-LABEL: v_test_sub_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -931,8 +921,8 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: v_test_sub_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -971,14 +961,14 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_sub_v4i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v16, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:16 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 @@ -997,12 +987,10 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-LABEL: v_test_sub_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v16, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0 +; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_load_b128 v[0:3], v12, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index fe234a82ba6f7f..6ec213a06999b6 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -8,13 +8,13 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -24,8 +24,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: v_test_sub_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -49,13 +49,13 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: v_test_sub_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -67,10 +67,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: v_test_sub_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -98,25 +96,25 
@@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 { ; GFX9-LABEL: s_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_pk_sub_i16 v0, s9, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_pk_sub_i16 v0, s11, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -139,23 +137,23 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: s_test_sub_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 
0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 +; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -177,7 +175,7 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 { ; GCN-LABEL: s_test_sub_self_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -187,7 +185,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX10-LABEL: s_test_sub_self_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 @@ -197,7 +195,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: s_test_sub_self_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -216,7 +214,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; 
GFX9-LABEL: s_test_sub_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -229,7 +227,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: s_test_sub_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -248,7 +246,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX10-LABEL: s_test_sub_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -260,7 +258,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX11-LABEL: s_test_sub_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -279,7 +277,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -293,7 +291,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; VI-LABEL: v_test_sub_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -312,7 +310,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_sub_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -326,9 +324,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_sub_v2i16_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -353,7 +349,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -367,7 +363,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -386,7 +382,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX10-LABEL: 
v_test_sub_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -400,9 +396,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_sub_v2i16_neg_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -426,7 +420,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -439,7 +433,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; VI-LABEL: v_test_sub_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -458,7 +452,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, 
v0, s[2:3] glc dlc @@ -472,9 +466,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -498,7 +490,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -511,7 +503,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -529,7 +521,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -543,9 +535,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -570,7 +560,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -583,7 +573,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; VI-LABEL: v_test_sub_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -601,7 +591,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -615,9 +605,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 
v0, v0, s[2:3] glc dlc @@ -642,13 +630,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -660,8 +648,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -684,13 +672,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, 
s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -704,10 +692,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -740,14 +726,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -760,8 +746,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 
v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -786,13 +772,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -808,10 +794,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -845,13 +829,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -863,8 +847,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -889,13 +873,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -909,10 +893,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -945,12 +927,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -965,8 +947,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -992,13 +974,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; 
GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 @@ -1015,10 +997,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index d4329aec2021c0..873567c3ab6f4c 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -7,7 +7,7 @@ target triple="amdgcn--" ; NOTE: breaking large PHIs is disabled here else this example is completely optimized out ; before reaching codegen. 
-define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) nounwind { ; CHECK-LABEL: foobar: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 @@ -59,4 +59,3 @@ ife: declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll index 19d633651fdd0d..1be420eccb353f 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll @@ -4,7 +4,7 @@ ; TODO: Update to check for granulated sgpr count directive once one is added. -define amdgpu_kernel void @kern() #0 { +define amdgpu_kernel void @kern() { ; ASM-LABEL: kern: ; ASM: .amdhsa_next_free_sgpr 5 ; ASM: .amdhsa_reserve_xnack_mask 1 @@ -23,7 +23,5 @@ entry: ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll index 2097579e0c9959..acdcd16a1f9efe 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll @@ -4,7 +4,7 @@ ; TODO: Update to check for granulated sgpr count directive once one is added. 
-define amdgpu_kernel void @kern() #0 { +define amdgpu_kernel void @kern() { ; ASM-LABEL: kern: ; ASM: .amdhsa_next_free_sgpr 5 ; ASM: .amdhsa_reserve_xnack_mask 0 @@ -23,7 +23,5 @@ entry: ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll index 775c62e73261a9..0aac07342db849 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll @@ -4,7 +4,7 @@ ; TODO: Update to check for granulated sgpr count directive once one is added. -define amdgpu_kernel void @kern() #0 { +define amdgpu_kernel void @kern() { ; ASM-LABEL: kern: ; ASM: .amdhsa_next_free_sgpr 5 ; ASM: .amdhsa_reserve_xnack_mask 1 @@ -23,7 +23,5 @@ entry: ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 52370f6a2ef054..7dce633e9186ae 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -12,7 +12,7 @@ declare void @llvm.debugtrap() #1 define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-LABEL: trap: ; NOHSA-TRAP-GFX900: ; %bb.0: -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -22,9 +22,9 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; ; HSA-TRAP-GFX803-LABEL: trap: ; HSA-TRAP-GFX803: ; %bb.0: -; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5] ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s2 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s3 @@ -34,7 +34,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; ; HSA-TRAP-GFX900-LABEL: trap: ; HSA-TRAP-GFX900: ; %bb.0: -; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -44,7 +44,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; ; HSA-NOTRAP-GFX900-LABEL: trap: ; HSA-NOTRAP-GFX900: ; %bb.0: -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -54,7 +54,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; ; HSA-TRAP-GFX1100-LABEL: trap: ; HSA-TRAP-GFX1100: ; %bb.0: -; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; HSA-TRAP-GFX1100-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1 ; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) @@ -103,7 +103,7 @@ define 
amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { ; NOHSA-TRAP-GFX900-LABEL: non_entry_trap: ; NOHSA-TRAP-GFX900: ; %bb.0: ; %entry -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -120,7 +120,7 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; ; HSA-TRAP-GFX803-LABEL: non_entry_trap: ; HSA-TRAP-GFX803: ; %bb.0: ; %entry -; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -136,12 +136,12 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) ; HSA-TRAP-GFX803-NEXT: s_endpgm ; HSA-TRAP-GFX803-NEXT: .LBB1_2: ; %trap -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5] ; HSA-TRAP-GFX803-NEXT: s_trap 2 ; ; HSA-TRAP-GFX900-LABEL: non_entry_trap: ; HSA-TRAP-GFX900: ; %bb.0: ; %entry -; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -158,7 +158,7 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; ; HSA-NOTRAP-GFX900-LABEL: non_entry_trap: ; HSA-NOTRAP-GFX900: ; %bb.0: ; %entry -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; 
HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NOTRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -175,7 +175,7 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; ; HSA-TRAP-GFX1100-LABEL: non_entry_trap: ; HSA-TRAP-GFX1100: ; %bb.0: ; %entry -; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -267,7 +267,7 @@ ret: define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { ; NOHSA-TRAP-GFX900-LABEL: trap_with_use_after: ; NOHSA-TRAP-GFX900: ; %bb.0: -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -281,8 +281,8 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-TRAP-GFX803-LABEL: trap_with_use_after: ; HSA-TRAP-GFX803: ; %bb.0: -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] -; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5 @@ -297,7 +297,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-TRAP-GFX900-LABEL: trap_with_use_after: ; HSA-TRAP-GFX900: ; %bb.0: -; HSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; HSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-NEXT: s_waitcnt 
lgkmcnt(0) ; HSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -309,7 +309,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-NOTRAP-GFX900-LABEL: trap_with_use_after: ; HSA-NOTRAP-GFX900: ; %bb.0: -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NOTRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -323,7 +323,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-TRAP-GFX1100-LABEL: trap_with_use_after: ; HSA-TRAP-GFX1100: ; %bb.0: -; HSA-TRAP-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; HSA-TRAP-GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -403,7 +403,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-LABEL: debugtrap: ; NOHSA-TRAP-GFX900: ; %bb.0: -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2 @@ -416,7 +416,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; ; HSA-TRAP-GFX803-LABEL: debugtrap: ; HSA-TRAP-GFX803: ; %bb.0: -; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) @@ -431,7 +431,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) 
nocapture readonly %arg0) ; ; HSA-TRAP-GFX900-LABEL: debugtrap: ; HSA-TRAP-GFX900: ; %bb.0: -; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2 @@ -445,7 +445,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; ; HSA-NOTRAP-GFX900-LABEL: debugtrap: ; HSA-NOTRAP-GFX900: ; %bb.0: -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2 @@ -458,7 +458,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; ; HSA-TRAP-GFX1100-LABEL: debugtrap: ; HSA-TRAP-GFX1100: ; %bb.0: -; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; HSA-TRAP-GFX1100-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1 ; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v2, 2 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll index 9bab3e6fcf8c45..2f687295af73e0 100644 --- a/llvm/test/CodeGen/AMDGPU/trap.ll +++ b/llvm/test/CodeGen/AMDGPU/trap.ll @@ -31,14 +31,14 @@ declare void @llvm.debugtrap() #1 ; MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 5080 +; MESA-TRAP-NEXT: .long 208 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 5016 +; NOMESA-TRAP-NEXT: .long 144 ; GCN-LABEL: {{^}}hsa_trap: -; HSA-TRAP: s_mov_b64 s[0:1], s[6:7] +; HSA-TRAP: s_mov_b64 s[0:1], s[4:5] ; HSA-TRAP: s_trap 2 ; HSA-TRAP: COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 @@ -59,11 +59,11 @@ define amdgpu_kernel void @hsa_trap(ptr addrspace(1) nocapture readonly %arg0) { ; 
MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 5080 +; MESA-TRAP-NEXT: .long 204 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 5016 +; NOMESA-TRAP-NEXT: .long 140 ; GCN-LABEL: {{^}}hsa_debugtrap: ; HSA-TRAP: s_trap 3 @@ -102,7 +102,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; NO-TRAP-BIT: enable_trap_handler = 0 ; HSA-TRAP: BB{{[0-9]_[0-9]+}}: ; %trap -; HSA-TRAP: s_mov_b64 s[0:1], s[6:7] +; HSA-TRAP: s_mov_b64 s[0:1], s[4:5] ; HSA-TRAP-NEXT: s_trap 2 define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { entry: @@ -124,7 +124,7 @@ ret: ; NO-TRAP-BIT: enable_trap_handler = 0 ; HSA-TRAP: BB{{[0-9]_[0-9]+}}: ; %trap -; HSA-TRAP: s_mov_b64 s[0:1], s[6:7] +; HSA-TRAP: s_mov_b64 s[0:1], s[4:5] ; HSA-TRAP-NEXT: s_trap 2 define amdgpu_kernel void @non_entry_trap_no_unreachable(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { entry: diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index 22eb7dddb84f4d..c0c56ebb166108 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -85,8 +85,8 @@ define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) { define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture readonly %arg1, ptr addrspace(1) nocapture %arg2) local_unnamed_addr { ; SI-LABEL: truncate_high_elt_extract_vector: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -103,8 +103,8 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) noc ; ; 
VI-LABEL: truncate_high_elt_extract_vector: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[4:5], 0x0 ; VI-NEXT: s_load_dword s3, s[6:7], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll index efb1a630f927ca..931953e230bb2e 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll @@ -5,58 +5,58 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out, <16 x i32> %in) { ; SI-LABEL: truncstore_arg_v16i32_to_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s23, 0xf000 -; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s1, s18, 0xff -; SI-NEXT: s_lshl_b32 s0, s19, 24 -; SI-NEXT: s_lshl_b32 s1, s1, 16 -; SI-NEXT: s_or_b32 s0, s0, s1 -; SI-NEXT: s_lshl_b32 s1, s17, 8 -; SI-NEXT: s_and_b32 s2, s16, 0xff -; SI-NEXT: s_or_b32 s1, s2, s1 -; SI-NEXT: s_and_b32 s1, s1, 0xffff -; SI-NEXT: s_and_b32 s2, s14, 0xff -; SI-NEXT: s_or_b32 s0, s1, s0 -; SI-NEXT: s_lshl_b32 s1, s15, 24 -; SI-NEXT: s_lshl_b32 s2, s2, 16 -; SI-NEXT: s_or_b32 s1, s1, s2 -; SI-NEXT: s_lshl_b32 s2, s13, 8 -; SI-NEXT: s_and_b32 s3, s12, 0xff -; SI-NEXT: s_or_b32 s2, s3, s2 -; SI-NEXT: s_and_b32 s2, s2, 0xffff -; SI-NEXT: s_and_b32 s3, s10, 0xff -; SI-NEXT: s_or_b32 s1, s2, s1 -; SI-NEXT: s_lshl_b32 s2, s11, 24 -; SI-NEXT: s_lshl_b32 s3, s3, 16 -; SI-NEXT: s_or_b32 s2, s2, s3 -; SI-NEXT: s_lshl_b32 s3, s9, 8 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: 
s_and_b32 s16, s16, 0xff +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 8 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 8 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_or_b32 s3, s8, s3 -; SI-NEXT: s_and_b32 s3, s3, 0xffff ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_or_b32 s2, s3, s2 -; SI-NEXT: s_lshl_b32 s3, s7, 24 +; SI-NEXT: s_lshl_b32 s19, s19, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_lshl_b32 s15, s15, 24 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_lshl_b32 s11, s11, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_or_b32 s3, s3, s6 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s3, s4, s3 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_mov_b32_e32 v3, s0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: truncstore_arg_v16i32_to_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s17 ; VI-NEXT: v_mov_b32_e32 v1, s16 @@ -98,9 +98,9 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out, define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out, <16 x i64> %in) { ; SI-LABEL: truncstore_arg_v16i64_to_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x39 -; SI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x29 +; SI-NEXT: s_load_dwordx16 s[16:31], s[0:1], 0x39 +; SI-NEXT: s_load_dwordx2 s[36:37], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x29 ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s38, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -149,9 +149,9 @@ define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out, ; ; VI-LABEL: truncstore_arg_v16i64_to_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0xe4 -; VI-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0xa4 +; VI-NEXT: s_load_dwordx16 s[16:31], s[0:1], 0xe4 +; VI-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0xa4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s26 ; VI-NEXT: v_mov_b32_e32 v1, s24 diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll index 88bdf6454fe522..a9cd0e997e0e59 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @trunc_i64_to_i32_store(ptr addrspace(1) %out, [8 x i32], i64 %in) { ; GCN-LABEL: {{^}}trunc_i64_to_i32_store: -; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[2:3], +; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], ; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]] ; SI: buffer_store_dword [[VLOAD]] ; VI: flat_store_dword v[{{[0-9:]+}}], 
[[VLOAD]] diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index 03a1b3598024b4..416dbb226422cc 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 { ; SI-LABEL: s_uaddo_i64_zext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -27,8 +27,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; ; VI-LABEL: s_uaddo_i64_zext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s6, s0 @@ -46,14 +46,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; ; GFX9-LABEL: s_uaddo_i64_zext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_add_u32 s0, s6, s0 +; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s7, s1 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -75,8 +75,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, 
ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_uaddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -95,8 +95,8 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_uaddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -111,12 +111,12 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_uaddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: global_store_byte v0, v2, s[6:7] @@ -132,7 +132,7 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -161,7 
+161,7 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_uaddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -182,7 +182,7 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -210,7 +210,7 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i32_novcc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -243,7 +243,7 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_uaddo_i32_novcc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -268,7 +268,7 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_uaddo_i32_novcc: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -301,7 +301,7 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel 
void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; SI-LABEL: s_uaddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -325,7 +325,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_uaddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 @@ -345,7 +345,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_uaddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s6, s4, s6 @@ -370,7 +370,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -401,7 +401,7 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_uaddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 
0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -454,7 +454,7 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -486,7 +486,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_uaddo_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -508,7 +508,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] @@ -537,7 +537,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_uaddo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -568,7 +568,7 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_uaddo_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 
s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -591,7 +591,7 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_uaddo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -618,45 +618,45 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_uaddo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: s_cmp_eq_u32 s0, s1 -; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; SI-NEXT: s_cmp_eq_u32 s2, s3 +; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 +; SI-NEXT: s_xor_b64 s[2:3], vcc, -1 ; SI-NEXT: .LBB8_2: ; %exit -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 -; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uaddo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: s_cmp_eq_u32 s0, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_cmp_eq_u32 s2, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_cbranch_scc1 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[0:1], vcc, -1 +; VI-NEXT: s_xor_b64 s[2:3], vcc, -1 ; VI-NEXT: .LBB8_2: ; %exit -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 @@ -668,19 +668,19 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: s_uaddo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_cmp_eq_u32 s0, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_cmp_eq_u32 s2, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1 ; GFX9-NEXT: .LBB8_2: ; %exit -; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: global_store_byte v1, v2, s[6:7] @@ -706,7 +706,7 @@ exit: define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s14, s2 @@ -740,7 +740,7 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_uaddo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -767,7 +767,7 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_uaddo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index dfd9a650ff0e96..f686aad0cefc25 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 
@@ -44,7 +44,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -80,7 +80,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GCN-LABEL: udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -112,7 +112,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX1030-LABEL: udiv_i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -185,7 +185,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: s_udiv_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -218,7 +218,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; VI-LABEL: s_udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -251,7 +251,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GCN-LABEL: s_udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_sub_i32 s4, 0, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX1030-LABEL: s_udiv_i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX1030-NEXT: s_sub_i32 s5, 0, s3 @@ -346,7 +346,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -401,7 +401,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: udiv_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -456,7 +456,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: udiv_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -507,7 +507,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: udiv_v2i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -619,7 +619,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr 
addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s6, s10 @@ -714,7 +714,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: udiv_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s6, s10 @@ -809,7 +809,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: udiv_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 16 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -904,7 +904,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: udiv_v4i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x1 @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_pow2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1116,7 +1116,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: udiv_i32_div_pow2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1134,7 +1134,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; GCN-LABEL: udiv_i32_div_pow2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1148,7 +1148,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; GFX1030-LABEL: udiv_i32_div_pow2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_k_even: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1203,7 +1203,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: udiv_i32_div_k_even: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; GCN-LABEL: udiv_i32_div_k_even: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1239,7 
+1239,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; GFX1030-LABEL: udiv_i32_div_k_even: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1277,7 +1277,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_k_odd: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: udiv_i32_div_k_odd: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1317,7 +1317,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; GCN-LABEL: udiv_i32_div_k_odd: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1333,7 +1333,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; GFX1030-LABEL: udiv_i32_div_k_odd: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1371,7 +1371,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa define 
amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1400,7 +1400,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: v_udiv_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1429,7 +1429,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GCN-LABEL: v_udiv_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1452,7 +1452,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX1030-LABEL: v_udiv_i8: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1540,7 +1540,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1569,7 +1569,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i16: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1651,7 +1651,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i23: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1688,7 +1688,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i23: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1725,7 +1725,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i23: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1770,7 +1770,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; 
GFX1030-LABEL: v_udiv_i23: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x3 @@ -1848,7 +1848,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i24: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1885,7 +1885,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i24: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1922,7 +1922,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i24: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1967,7 +1967,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i24: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x3 @@ -2048,7 +2048,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) { ; SI-LABEL: scalarize_mulhu_4xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2076,7 +2076,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; VI-LABEL: scalarize_mulhu_4xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2104,7 +2104,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; GCN-LABEL: scalarize_mulhu_4xi32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -2130,7 +2130,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; GFX1030-LABEL: scalarize_mulhu_4xi32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -2193,7 +2193,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read define amdgpu_kernel void @test_udiv2(i32 %p) { ; SI-LABEL: test_udiv2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2205,7 +2205,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; VI-LABEL: test_udiv2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2217,7 +2217,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; GCN-LABEL: test_udiv2: 
; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -2227,7 +2227,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; GFX1030-LABEL: test_udiv2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 ; GFX1030-NEXT: v_mov_b32_e32 v0, s0 @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; SI-LABEL: test_udiv_3_mulhu: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; VI-LABEL: test_udiv_3_mulhu: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -2279,7 +2279,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; GCN-LABEL: test_udiv_3_mulhu: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2290,7 +2290,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; GFX1030-LABEL: test_udiv_3_mulhu: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_mul_hi_u32 s0, s0, 0xaaaaaaab ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 78f85569f849d7..84906ac1f27ba9 100644 --- 
a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -123,8 +123,8 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; ; GCN-IR-LABEL: s_test_udiv_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -398,8 +398,8 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -423,8 +423,8 @@ define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_udiv24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -497,17 +497,17 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) { define 
amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv32_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s8, s[2:3], 0xe +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_sub_i32 s0, 0, s8 +; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 @@ -533,17 +533,17 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_udiv32_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s8, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s4, s0 @@ -576,18 +576,18 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv31_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-NEXT: s_load_dword s2, s[0:1], 
0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s8, s0, 1 +; GCN-NEXT: s_lshr_b32 s8, s2, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_sub_i32 s0, 0, s8 +; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s3, 1 @@ -614,18 +614,18 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_udiv31_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s8, s0, 1 +; GCN-IR-NEXT: s_lshr_b32 s8, s2, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1 @@ -659,8 +659,8 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv23_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe +; 
GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -684,8 +684,8 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_udiv23_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -716,14 +716,12 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_udiv24_i48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s0, s0, 0xff000000 -; GCN-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NEXT: s_and_b32 s0, s2, 0xff000000 +; GCN-NEXT: s_and_b32 s1, s3, 0xffff ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 @@ -734,18 +732,20 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_sub_u32 s8, 0, s0 ; GCN-NEXT: s_subb_u32 s9, 0, s1 -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s2, -1 
+; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: v_mul_lo_u32 v3, s8, v2 ; GCN-NEXT: v_mul_hi_u32 v4, s8, v1 ; GCN-NEXT: v_mul_lo_u32 v5, s9, v1 ; GCN-NEXT: v_mul_lo_u32 v6, s8, v1 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 @@ -831,20 +831,20 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s1, s5, 0xffff -; GCN-IR-NEXT: s_and_b32 s0, s4, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s3, s5, 0xffff +; GCN-IR-NEXT: s_and_b32 s2, s4, 0xff000000 ; GCN-IR-NEXT: s_and_b32 s5, s7, 0xffff ; GCN-IR-NEXT: s_and_b32 s4, s6, 0xff000000 -; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[0:1], 24 -; GCN-IR-NEXT: s_lshr_b64 s[0:1], s[4:5], 24 +; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[4:5], 24 ; GCN-IR-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[0:1], 0 +; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[0:1] +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[2:3] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[8:9] ; GCN-IR-NEXT: s_sub_u32 s12, s10, s16 @@ -869,8 +869,8 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s14 -; GCN-IR-NEXT: s_add_u32 s14, s0, -1 -; GCN-IR-NEXT: s_addc_u32 s15, s1, -1 +; GCN-IR-NEXT: s_add_u32 s14, s2, -1 +; GCN-IR-NEXT: s_addc_u32 s15, s3, -1 ; 
GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11] ; GCN-IR-NEXT: s_add_u32 s8, s4, s16 ; GCN-IR-NEXT: s_addc_u32 s9, s5, 0 @@ -888,7 +888,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_and_b32 s4, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[0:1] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 @@ -898,10 +898,10 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 ; GCN-IR-NEXT: .LBB7_4: ; %Flow4 -; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[0:1] +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 +; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB7_5: ; %udiv-end -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 @@ -920,7 +920,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1025,7 +1025,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_udiv_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: 
s_flbit_i32_b64 s12, s[2:3] @@ -1364,7 +1364,7 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_add_u32 s1, 0, 0xaaaa0000 ; GCN-NEXT: v_not_b32_e32 v0, 23 ; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 @@ -1443,7 +1443,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_udiv_k_den_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] ; GCN-IR-NEXT: s_sub_u32 s8, 59, s12 @@ -1661,7 +1661,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s3, 8 @@ -1682,7 +1682,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_udiv24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 @@ -1709,7 +1709,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_udiv24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; 
GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1731,7 +1731,7 @@ define amdgpu_kernel void @s_test_udiv24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_udiv24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index 1468c7b99b5c25..f0f0b6680e0e6e 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -36,22 +36,22 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a ; ; GFX6-LABEL: test_udivrem: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s8, s[2:3], 0x26 -; GFX6-NEXT: s_load_dword s9, s[2:3], 0x1d +; GFX6-NEXT: s_load_dword s8, s[0:1], 0x26 +; GFX6-NEXT: s_load_dword s9, s[0:1], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: s_sub_i32 s2, 0, s8 +; GFX6-NEXT: s_mov_b32 s3, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, s6 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: v_readfirstlane_b32 s10, v0 @@ -69,34 +69,33 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) 
%out0, [8 x i32], ptr a ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_cselect_b32 s8, s10, s9 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x98 -; GFX8-NEXT: s_load_dword s5, s[2:3], 0x74 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x98 +; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX8-NEXT: s_sub_i32 s0, 0, s4 +; GFX8-NEXT: s_sub_i32 s2, 0, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c +; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_readfirstlane_b32 s0, v4 ; GFX8-NEXT: s_mul_i32 s0, s0, s4 ; GFX8-NEXT: s_sub_i32 s0, s5, s0 @@ -164,33 +163,33 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; ; GFX6-LABEL: test_udivrem_v2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s0, 0, s6 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s6 -; GFX6-NEXT: s_sub_i32 s0, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s6 -; GFX6-NEXT: s_cmp_ge_u32 s0, s6 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s6 -; GFX6-NEXT: s_cmp_ge_u32 s0, s6 -; GFX6-NEXT: s_cselect_b32 s4, s1, s0 -; GFX6-NEXT: s_sub_i32 s0, 0, s7 -; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s6 +; GFX6-NEXT: s_sub_i32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cselect_b32 s4, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s7 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 @@ -207,46 +206,44 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX6-NEXT: s_cselect_b32 s5, s6, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: 
test_udivrem_v2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX8-NEXT: s_sub_i32 s0, 0, s6 +; GFX8-NEXT: s_sub_i32 s2, 0, s6 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: s_mul_i32 s0, s0, s6 -; GFX8-NEXT: s_sub_i32 s0, s4, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s6 -; GFX8-NEXT: s_cmp_ge_u32 s0, s6 -; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s6 -; GFX8-NEXT: s_cmp_ge_u32 s0, s6 -; GFX8-NEXT: s_cselect_b32 s4, s1, s0 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: s_mul_i32 s2, s2, s6 +; GFX8-NEXT: s_sub_i32 s2, s4, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s6 +; GFX8-NEXT: s_cmp_ge_u32 s2, s6 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s6 +; GFX8-NEXT: s_cmp_ge_u32 s2, s6 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, 0, s7 +; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: 
v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: s_mul_i32 s2, s2, s7 ; GFX8-NEXT: s_sub_i32 s2, s5, s2 @@ -335,36 +332,34 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; ; GFX6-LABEL: test_udivrem_v4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: s_sub_i32 s2, 0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s8 -; GFX6-NEXT: s_sub_i32 s0, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v0, s1, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s8 +; GFX6-NEXT: s_sub_i32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s4, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: 
v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 @@ -372,82 +367,87 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s9 -; GFX6-NEXT: s_sub_i32 s1, s5, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, 0, s10 -; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s9 +; GFX6-NEXT: s_sub_i32 s2, s5, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s5, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s10 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s10 -; GFX6-NEXT: s_sub_i32 s4, s6, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, 0, s11 -; GFX6-NEXT: v_mul_lo_u32 v0, s5, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s10 +; GFX6-NEXT: s_sub_i32 s2, s6, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 
+; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s6, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s11 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v2, s7, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v2 -; GFX6-NEXT: s_mul_i32 s0, s0, s11 -; GFX6-NEXT: s_sub_i32 s0, s7, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: s_mul_i32 s4, s4, s11 +; GFX6-NEXT: s_sub_i32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem_v4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: s_sub_i32 s2, 0, s8 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s9 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: 
v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s10 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: s_mul_i32 s0, s0, s8 -; GFX8-NEXT: s_sub_i32 s0, s4, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s8 -; GFX8-NEXT: s_cmp_ge_u32 s0, s8 -; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s8 -; GFX8-NEXT: s_cmp_ge_u32 s0, s8 -; GFX8-NEXT: s_cselect_b32 s4, s1, s0 -; GFX8-NEXT: s_sub_i32 s0, 0, s9 -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: s_mul_i32 s2, s2, s8 +; GFX8-NEXT: s_sub_i32 s2, s4, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s8 +; GFX8-NEXT: s_cmp_ge_u32 s2, s8 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s8 +; GFX8-NEXT: s_cmp_ge_u32 s2, s8 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, 0, s9 +; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 @@ -455,44 +455,40 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: s_mul_i32 s0, s0, s9 -; GFX8-NEXT: s_sub_i32 s0, s5, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s9 -; GFX8-NEXT: s_cmp_ge_u32 s0, s9 -; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s9 -; GFX8-NEXT: s_cmp_ge_u32 s0, s9 -; GFX8-NEXT: s_cselect_b32 s5, s1, s0 -; GFX8-NEXT: s_sub_i32 s0, 0, s10 -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: s_mul_i32 s3, 
s3, s9 +; GFX8-NEXT: s_sub_i32 s3, s5, s3 +; GFX8-NEXT: s_sub_i32 s4, s3, s9 +; GFX8-NEXT: s_cmp_ge_u32 s3, s9 +; GFX8-NEXT: s_cselect_b32 s3, s4, s3 +; GFX8-NEXT: s_sub_i32 s4, s3, s9 +; GFX8-NEXT: s_cmp_ge_u32 s3, s9 +; GFX8-NEXT: s_cselect_b32 s3, s4, s3 +; GFX8-NEXT: s_sub_i32 s4, 0, s10 +; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: s_mul_i32 s0, s0, s10 -; GFX8-NEXT: s_sub_i32 s0, s6, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s10 -; GFX8-NEXT: s_cmp_ge_u32 s0, s10 -; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s10 -; GFX8-NEXT: s_cmp_ge_u32 s0, s10 -; GFX8-NEXT: s_cselect_b32 s6, s1, s0 -; GFX8-NEXT: s_sub_i32 s0, 0, s11 -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: s_mul_i32 s4, s4, s10 +; GFX8-NEXT: s_sub_i32 s4, s6, s4 +; GFX8-NEXT: s_sub_i32 s5, s4, s10 +; GFX8-NEXT: s_cmp_ge_u32 s4, s10 +; GFX8-NEXT: s_cselect_b32 s4, s5, s4 +; GFX8-NEXT: s_sub_i32 s5, s4, s10 +; GFX8-NEXT: s_cmp_ge_u32 s4, s10 +; GFX8-NEXT: s_cselect_b32 s4, s5, s4 +; GFX8-NEXT: s_sub_i32 s5, 0, s11 +; GFX8-NEXT: v_mul_lo_u32 v0, s5, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v3, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_readfirstlane_b32 s2, v3 ; GFX8-NEXT: s_mul_i32 s2, s2, s11 ; GFX8-NEXT: s_sub_i32 s2, s7, s2 diff --git 
a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index d00ea6dff24474..ba52d702c7ed11 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_uint_to_fp_i64_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -26,7 +26,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_uint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -53,7 +53,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_uint_to_fp_i64_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -66,7 +66,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; ; VI-LABEL: s_uint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -84,8 +84,8 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 x i64> %in) { 
; SI-LABEL: s_uint_to_fp_v2i64_to_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 @@ -102,12 +102,12 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 ; ; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 ; VI-NEXT: v_ldexp_f64 v[4:5], v[2:3], 32 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -126,8 +126,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) { ; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s11 ; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s9 @@ -158,8 +158,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; ; VI-LABEL: s_uint_to_fp_v4i64_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s15 ; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], 
s13 @@ -194,8 +194,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_uint_to_fp_i32_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; SI-NEXT: v_mov_b32_e32 v3, s1 @@ -205,8 +205,8 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i ; ; VI-LABEL: s_uint_to_fp_i32_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -221,7 +221,7 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -237,8 +237,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 x i32> %in) { ; SI-LABEL: s_uint_to_fp_v4i32_to_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -257,8 +257,8 @@ 
define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; ; VI-LABEL: s_uint_to_fp_v4i32_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -284,8 +284,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: uint_to_fp_i1_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -298,8 +298,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: uint_to_fp_i1_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -318,8 +318,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %in) { ; SI-LABEL: uint_to_fp_i1_to_f64_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s2, 0 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -332,8 +332,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % ; ; VI-LABEL: 
uint_to_fp_i1_to_f64_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -351,8 +351,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) { ; SI-LABEL: s_uint_to_fp_i8_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xff ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -363,8 +363,8 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; ; VI-LABEL: s_uint_to_fp_i8_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -400,8 +400,8 @@ define double @v_uint_to_fp_i8_to_f64(i8 %in) { define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_select_uint_to_fp_i1_vals_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -414,8 +414,8 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_uint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -451,8 +451,8 @@ define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_select_uint_to_fp_i1_vals_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -465,8 +465,8 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_uint_to_fp_i1_vals_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -503,8 +503,8 @@ define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -517,8 +517,8 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; ; VI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll index 3d0fc4e6281a6b..79b0a966bc1fbd 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_uint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_uint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s4, s3 ; GFX8-NEXT: s_min_u32 s4, s4, 32 @@ -46,7 +46,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_uint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s4, s3 @@ -75,7 +75,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -100,7 +100,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, 
ptr ad ; ; GFX8-LABEL: v_uint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -126,27 +126,26 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_uint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v3, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -162,7 +161,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_uint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -181,7 +180,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_uint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s4, s3 ; GFX8-NEXT: s_min_u32 s4, s4, 32 @@ -198,7 +197,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_uint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s4, s3 @@ -225,7 +224,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 
3, v0 @@ -249,7 +248,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_uint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -274,26 +273,24 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_uint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v3, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3 +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -309,8 +306,8 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; GFX6-LABEL: s_uint_to_fp_v2i64_to_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -335,8 +332,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s2, s7 ; GFX8-NEXT: s_flbit_i32_b32 s3, s5 @@ -362,8 +359,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s2, s7 @@ -395,7 +392,7 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_v4i64_to_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -446,7 +443,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -499,53 +496,51 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16 -; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_clz_i32_u32_e32 v9, v3 -; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4 +; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v11, v7 -; GFX11-NEXT: v_clz_i32_u32_e32 v12, v5 +; GFX11-NEXT: v_clz_i32_u32_e32 v11, v8 +; GFX11-NEXT: v_clz_i32_u32_e32 v12, v6 ; GFX11-NEXT: v_min_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v11, 32, v11 ; GFX11-NEXT: v_min_u32_e32 v12, 32, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] -; GFX11-NEXT: 
v_lshlrev_b64 v[0:1], v10, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4] +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 -; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12 -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 +; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v8 -; GFX11-NEXT: v_ldexp_f32 v3, v2, v9 -; GFX11-NEXT: v_ldexp_f32 v2, v0, v10 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 -; GFX11-NEXT: v_ldexp_f32 v0, v5, v4 -; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: v_cvt_f32_u32_e32 v6, v2 +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 4, v0 +; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 +; GFX11-NEXT: v_ldexp_f32 v2, v1, v10 +; GFX11-NEXT: v_ldexp_f32 v1, v6, v11 +; GFX11-NEXT: v_ldexp_f32 v0, v4, v5 +; GFX11-NEXT: 
global_store_b128 v7, v[0:3], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -561,8 +556,8 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; GFX6-LABEL: s_uint_to_fp_v2i64_to_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -591,8 +586,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s2, s7 ; GFX8-NEXT: s_flbit_i32_b32 s3, s5 @@ -621,8 +616,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s2, s7 @@ -659,7 +654,7 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_v4i64_to_v4f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -718,7 +713,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -777,61 +772,59 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16 -; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_clz_i32_u32_e32 v9, v3 -; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4 +; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v11, v7 -; GFX11-NEXT: v_clz_i32_u32_e32 v12, v5 +; GFX11-NEXT: v_clz_i32_u32_e32 v11, v8 +; GFX11-NEXT: v_clz_i32_u32_e32 v12, v6 ; GFX11-NEXT: v_min_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v11, 32, v11 ; GFX11-NEXT: v_min_u32_e32 v12, 32, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] -; GFX11-NEXT: 
v_lshlrev_b64 v[0:1], v10, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4] +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 -; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12 -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v8 -; GFX11-NEXT: v_ldexp_f32 v2, v2, v9 -; GFX11-NEXT: v_ldexp_f32 v0, v0, v10 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 -; GFX11-NEXT: v_ldexp_f32 v3, v3, v4 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v10 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v1 +; GFX11-NEXT: v_ldexp_f32 v2, v2, v11 +; GFX11-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v1, v0, v2 -; GFX11-NEXT: v_pack_b32_f16 v0, v3, v4 +; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3 +; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index c21ae434f44709..5f8d0f665a953d 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; SI-LABEL: uitofp_i16_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; ; VI-LABEL: uitofp_i16_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -43,7 +43,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; ; GFX11-LABEL: uitofp_i16_to_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @uitofp_i32_to_f16( ; SI-LABEL: 
uitofp_i32_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; ; VI-LABEL: uitofp_i32_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; ; GFX11-LABEL: uitofp_i32_to_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -143,7 +143,7 @@ entry: define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; SI-LABEL: uitofp_v2i16_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -168,7 +168,7 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; ; VI-LABEL: uitofp_v2i16_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -188,7 +188,7 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; ; GFX11-LABEL: uitofp_v2i16_to_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -221,7 +221,7 @@ entry: define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; SI-LABEL: uitofp_v2i32_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -244,7 +244,7 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; ; VI-LABEL: uitofp_v2i32_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -266,7 +266,7 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; ; GFX11-LABEL: uitofp_v2i32_to_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -301,21 +301,19 @@ entry: define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_uint_to_fp_i1_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -323,26 +321,26 @@ define amdgpu_kernel 
void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uint_to_fp_i1_to_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -350,14 +348,16 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_uint_to_fp_i1_to_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll index a3fc6ded0a0047..f60a274f1e592b 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -5,36 +5,36 @@ define amdgpu_kernel void @uniform_if_scc(i32 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s0, 0 -; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_cmp_eq_u32 s2, 0 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_cbranch_scc1 .LBB0_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s0, 1 +; SI-NEXT: s_mov_b32 s2, 1 ; SI-NEXT: .LBB0_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_scc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s0, 0 -; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_cmp_eq_u32 s2, 0 +; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB0_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: .LBB0_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm 
@@ -57,38 +57,38 @@ done: define amdgpu_kernel void @uniform_if_vcc(float %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_vcc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_load_dword s3, s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], s1, 0 +; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], s3, 0 ; SI-NEXT: s_and_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB1_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s0, 1 +; SI-NEXT: s_mov_b32 s2, 1 ; SI-NEXT: .LBB1_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_vcc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s1, s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_load_dword s3, s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], s1, 0 +; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], s3, 0 ; VI-NEXT: s_and_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz .LBB1_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: .LBB1_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -111,36 +111,36 @@ done: define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_swap_br_targets_scc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword 
s2, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 -; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_cbranch_scc1 .LBB2_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s0, 1 +; SI-NEXT: s_mov_b32 s2, 1 ; SI-NEXT: .LBB2_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_swap_br_targets_scc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB2_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: .LBB2_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -163,38 +163,38 @@ done: define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_swap_br_targets_vcc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_load_dword s3, s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_neq_f32_e64 s[4:5], s1, 0 +; SI-NEXT: v_cmp_neq_f32_e64 s[4:5], s3, 0 ; SI-NEXT: s_and_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB3_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s0, 1 +; SI-NEXT: 
s_mov_b32 s2, 1 ; SI-NEXT: .LBB3_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_swap_br_targets_vcc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s1, s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_load_dword s3, s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], s1, 0 +; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], s3, 0 ; VI-NEXT: s_and_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz .LBB3_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: .LBB3_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -219,14 +219,14 @@ done: define amdgpu_kernel void @uniform_if_move_valu(ptr addrspace(1) %out, float %a) { ; SI-LABEL: uniform_if_move_valu: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e32 v0, s0, v0 +; SI-NEXT: v_add_f32_e32 v0, s2, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 ; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -237,14 +237,14 @@ define amdgpu_kernel void @uniform_if_move_valu(ptr addrspace(1) %out, float %a) ; ; 
VI-LABEL: uniform_if_move_valu: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-NEXT: v_add_f32_e32 v0, s2, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 ; VI-NEXT: s_cbranch_vccnz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -271,14 +271,14 @@ endif: define amdgpu_kernel void @uniform_if_move_valu_commute(ptr addrspace(1) %out, float %a) { ; SI-LABEL: uniform_if_move_valu_commute: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e32 v0, s0, v0 +; SI-NEXT: v_add_f32_e32 v0, s2, v0 ; SI-NEXT: v_cmp_gt_u32_e32 vcc, 6, v0 ; SI-NEXT: s_cbranch_vccnz .LBB5_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -289,14 +289,14 @@ define amdgpu_kernel void @uniform_if_move_valu_commute(ptr addrspace(1) %out, f ; ; VI-LABEL: uniform_if_move_valu_commute: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-NEXT: v_add_f32_e32 v0, s2, v0 ; VI-NEXT: v_cmp_gt_u32_e32 vcc, 6, v0 ; VI-NEXT: s_cbranch_vccnz .LBB5_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -322,36 +322,38 @@ endif: define 
amdgpu_kernel void @uniform_if_else_ret(ptr addrspace(1) nocapture %out, i32 %a) { ; SI-LABEL: uniform_if_else_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB6_2 ; SI-NEXT: ; %bb.1: ; %if.else +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB6_2: ; %if.then +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_else_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc0 .LBB6_2 ; VI-NEXT: ; %bb.1: ; %if.else +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB6_2: ; %if.then +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -374,8 +376,8 @@ if.end: ; preds = %if.else, %if.then define amdgpu_kernel void @uniform_if_else(ptr addrspace(1) nocapture %out0, ptr addrspace(1) nocapture %out1, i32 %a) { ; SI-LABEL: uniform_if_else: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -401,8 +403,8 @@ define amdgpu_kernel void @uniform_if_else(ptr addrspace(1) nocapture %out0, ptr ; ; VI-LABEL: uniform_if_else: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -444,17 +446,17 @@ if.end: ; preds = %if.else, %if.then define amdgpu_kernel void @icmp_2_users(ptr addrspace(1) %out, i32 %cond) { ; SI-LABEL: icmp_2_users: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_gt_i32 s4, 0 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: s_cmp_lt_i32 s4, 1 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %IF -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: .LBB8_2: ; %ENDIF @@ -462,17 +464,17 @@ define amdgpu_kernel void @icmp_2_users(ptr addrspace(1) %out, i32 %cond) { ; ; VI-LABEL: icmp_2_users: ; VI: ; %bb.0: ; %main_body -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_gt_i32 s4, 0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: s_cmp_lt_i32 s4, 1 ; VI-NEXT: s_cbranch_scc1 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %IF -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: .LBB8_2: ; %ENDIF @@ -493,20 +495,20 @@ ENDIF: ; preds = %IF, %main_body define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, ptr addrspace(1) %out) { ; SI-LABEL: icmp_users_different_blocks: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lt_i32 s0, 1 +; SI-NEXT: s_cmp_lt_i32 s2, 1 ; SI-NEXT: s_cbranch_scc1 .LBB9_2 ; SI-NEXT: ; %bb.1: ; %bb2 -; SI-NEXT: s_cmp_gt_i32 s1, 0 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-NEXT: s_cmp_gt_i32 s3, 0 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %bb9 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB9_3: ; %bb7 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -516,20 +518,20 @@ define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, p ; ; VI-LABEL: icmp_users_different_blocks: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lt_i32 s0, 1 +; VI-NEXT: s_cmp_lt_i32 s2, 1 ; VI-NEXT: s_cbranch_scc1 .LBB9_2 ; VI-NEXT: ; %bb.1: ; %bb2 -; VI-NEXT: s_cmp_gt_i32 s1, 0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_and_b64 vcc, exec, s[0:1] +; VI-NEXT: s_cmp_gt_i32 s3, 0 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB9_3 ; VI-NEXT: .LBB9_2: ; %bb9 ; 
VI-NEXT: s_endpgm ; VI-NEXT: .LBB9_3: ; %bb7 -; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -558,7 +560,7 @@ bb9: ; preds = %bb8, %bb4 define amdgpu_kernel void @uniform_loop(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: uniform_loop: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s0, s[0:1], 0xb ; SI-NEXT: .LBB10_1: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -570,7 +572,7 @@ define amdgpu_kernel void @uniform_loop(ptr addrspace(1) %out, i32 %a) { ; ; VI-LABEL: uniform_loop: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: .LBB10_1: ; %loop ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -598,11 +600,11 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; SI-LABEL: uniform_inside_divergent: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_cbranch_execz .LBB11_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -621,11 +623,11 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; VI-LABEL: uniform_inside_divergent: ; VI: ; %bb.0: ; %entry ; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; VI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; VI-NEXT: s_cbranch_execz .LBB11_2 ; VI-NEXT: ; %bb.1: 
; %if -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -660,14 +662,14 @@ endif: define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 %cond) { ; SI-LABEL: divergent_inside_uniform: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB12_2 ; SI-NEXT: .LBB12_1: ; %endif ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB12_2: ; %if -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -683,14 +685,14 @@ define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 % ; ; VI-LABEL: divergent_inside_uniform: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc0 .LBB12_2 ; VI-NEXT: .LBB12_1: ; %endif ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB12_2: ; %if -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -724,9 +726,9 @@ endif: define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %cond) { ; SI-LABEL: divergent_if_uniform_if: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_cbranch_execz 
.LBB13_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -735,8 +737,8 @@ define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %c ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: .LBB13_2: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_load_dword s0, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 @@ -752,9 +754,9 @@ define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %c ; ; VI-LABEL: divergent_if_uniform_if: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; VI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; VI-NEXT: s_cbranch_execz .LBB13_2 ; VI-NEXT: ; %bb.1: ; %if ; VI-NEXT: s_mov_b32 s7, 0xf000 @@ -763,8 +765,8 @@ define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: .LBB13_2: ; %endif -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_or_b64 exec, exec, s[2:3] +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB13_4 @@ -805,12 +807,12 @@ exit: define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, ptr addrspace(1) %out) { ; SI-LABEL: cse_uniform_condition_different_blocks: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lt_i32 s0, 1 +; SI-NEXT: s_cmp_lt_i32 s2, 1 ; SI-NEXT: s_cbranch_scc1 .LBB14_2 ; SI-NEXT: ; %bb.1: ; %bb2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -825,12 +827,12 @@ define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, ptr ; ; VI-LABEL: cse_uniform_condition_different_blocks: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lt_i32 s0, 1 +; VI-NEXT: s_cmp_lt_i32 s2, 1 ; VI-NEXT: s_cbranch_scc1 .LBB14_2 ; VI-NEXT: ; %bb.1: ; %bb2 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -865,7 +867,7 @@ bb9: ; preds = %bb8, %bb4 define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc_i64_eq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_eq_u64_e64 s[4:5], s[0:1], 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -884,7 +886,7 @@ define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, ptr addrspace(1) %ou ; ; VI-LABEL: uniform_if_scc_i64_eq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u64 s[0:1], 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -919,7 +921,7 @@ done: define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc_i64_ne: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -938,7 +940,7 @@ define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, ptr addrspace(1) %ou ; ; VI-LABEL: uniform_if_scc_i64_ne: ; VI: ; %bb.0: ; 
%entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -973,7 +975,7 @@ done: define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc_i64_sgt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[0:1], 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -992,7 +994,7 @@ define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, ptr addrspace(1) %o ; ; VI-LABEL: uniform_if_scc_i64_sgt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[0:1], 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -1029,17 +1031,17 @@ define amdgpu_kernel void @move_to_valu_i64_eq(ptr addrspace(1) %out) { ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 -; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; SI-NEXT: s_cbranch_vccnz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s0, 1 +; SI-NEXT: s_mov_b32 s2, 1 ; SI-NEXT: .LBB18_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1048,17 +1050,17 @@ define amdgpu_kernel void @move_to_valu_i64_eq(ptr addrspace(1) %out) { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_read_b64 v[0:1], v0 -; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, 
0, v[0:1] ; VI-NEXT: s_cbranch_vccnz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: .LBB18_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1083,17 +1085,17 @@ define amdgpu_kernel void @move_to_valu_i64_ne(ptr addrspace(1) %out) { ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 -; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; SI-NEXT: s_cbranch_vccnz .LBB19_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s0, 1 +; SI-NEXT: s_mov_b32 s2, 1 ; SI-NEXT: .LBB19_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1102,17 +1104,17 @@ define amdgpu_kernel void @move_to_valu_i64_ne(ptr addrspace(1) %out) { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_read_b64 v[0:1], v0 -; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; VI-NEXT: s_cbranch_vccnz .LBB19_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: .LBB19_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, 
off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll index 18b2397bbd5a7e..0cb408676552e1 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-select.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX90A-LABEL: test_insert_extract: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: s_mov_b32 s2, 0 ; GFX90A-NEXT: s_and_b64 vcc, exec, -1 ; GFX90A-NEXT: s_mov_b32 s3, 0 @@ -55,7 +55,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; ; GFX940-LABEL: test_insert_extract: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-NEXT: s_mov_b32 s2, 0 ; GFX940-NEXT: s_and_b64 vcc, exec, -1 ; GFX940-NEXT: s_mov_b32 s3, 0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; ; GFX1030-LABEL: test_insert_extract: ; GFX1030: ; %bb.0: ; %entry -; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX1030-NEXT: s_mov_b32 s2, 0 ; GFX1030-NEXT: s_mov_b32 s3, 0 ; GFX1030-NEXT: s_mov_b32 s4, 0 @@ -151,7 +151,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; ; GFX1100-LABEL: test_insert_extract: ; GFX1100: ; %bb.0: ; %entry -; GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1100-NEXT: s_mov_b32 s2, 0 ; GFX1100-NEXT: s_mov_b32 s3, 0 ; GFX1100-NEXT: s_mov_b32 s4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 63105453174ebe..c0c84d46b7356b 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; 
GCN-LABEL: s_test_urem_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -122,8 +122,8 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; ; GCN-IR-LABEL: s_test_urem_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -413,18 +413,18 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem31_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s8, s0, 1 +; GCN-NEXT: s_lshr_b32 s8, s2, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_sub_i32 s0, 0, s8 +; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s3, 1 @@ -448,18 +448,18 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_urem31_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword 
s2, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s8, s0, 1 +; GCN-IR-NEXT: s_lshr_b32 s8, s2, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1 @@ -490,112 +490,110 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem31_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_lshr_b32 s0, s9, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: s_sub_i32 s1, 0, s0 -; GCN-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-NEXT: s_lshr_b32 s8, s7, 1 +; GCN-NEXT: s_lshr_b32 s2, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_sub_i32 s3, 0, s2 +; GCN-NEXT: s_lshr_b32 s4, s11, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-NEXT: s_lshr_b32 s1, s11, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-NEXT: s_lshr_b32 s3, s5, 1 +; GCN-NEXT: s_lshr_b32 s5, s7, 1 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; 
GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_mul_i32 s5, s5, s0 -; GCN-NEXT: s_sub_i32 s4, s4, s5 -; GCN-NEXT: s_sub_i32 s5, s4, s0 -; GCN-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-NEXT: s_sub_i32 s5, s4, s0 -; GCN-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-NEXT: s_cselect_b32 s0, s5, s4 -; GCN-NEXT: s_sub_i32 s4, 0, s1 -; GCN-NEXT: v_mul_lo_u32 v0, s4, v1 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: s_mul_i32 s6, s6, s2 +; GCN-NEXT: s_sub_i32 s3, s3, s6 +; GCN-NEXT: s_sub_i32 s6, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s3, s6, s3 +; GCN-NEXT: s_sub_i32 s6, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s6, s6, s3 +; GCN-NEXT: s_sub_i32 s2, 0, s4 +; GCN-NEXT: v_mul_lo_u32 v0, s2, v1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mul_hi_u32 v2, s5, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-NEXT: s_mul_i32 s0, s0, s1 -; GCN-NEXT: s_sub_i32 s0, s8, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s1 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s1 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NEXT: s_mul_i32 s6, s6, s4 +; GCN-NEXT: s_sub_i32 s5, s5, s6 +; GCN-NEXT: s_sub_i32 s6, s5, s4 +; 
GCN-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-NEXT: s_cselect_b32 s5, s6, s5 +; GCN-NEXT: s_sub_i32 s6, s5, s4 +; GCN-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem31_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_lshr_b32 s0, s9, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: s_sub_i32 s1, 0, s0 -; GCN-IR-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-IR-NEXT: s_lshr_b32 s8, s7, 1 +; GCN-IR-NEXT: s_lshr_b32 s2, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: s_sub_i32 s3, 0, s2 +; GCN-IR-NEXT: s_lshr_b32 s4, s11, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s11, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-IR-NEXT: s_lshr_b32 s3, s5, 1 +; GCN-IR-NEXT: s_lshr_b32 s5, s7, 1 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-IR-NEXT: s_mul_i32 s5, s5, s0 -; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 -; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 -; 
GCN-IR-NEXT: s_cselect_b32 s0, s5, s4 -; GCN-IR-NEXT: s_sub_i32 s4, 0, s1 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v1 -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-IR-NEXT: s_mul_i32 s6, s6, s2 +; GCN-IR-NEXT: s_sub_i32 s3, s3, s6 +; GCN-IR-NEXT: s_sub_i32 s6, s3, s2 +; GCN-IR-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-IR-NEXT: s_cselect_b32 s3, s6, s3 +; GCN-IR-NEXT: s_sub_i32 s6, s3, s2 +; GCN-IR-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-IR-NEXT: s_cselect_b32 s6, s6, s3 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s4 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v1 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s5, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-IR-NEXT: s_mul_i32 s0, s0, s1 -; GCN-IR-NEXT: s_sub_i32 s0, s8, s0 -; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-IR-NEXT: s_mul_i32 s6, s6, s4 +; GCN-IR-NEXT: s_sub_i32 s5, s5, s6 +; GCN-IR-NEXT: s_sub_i32 s6, s5, s4 +; GCN-IR-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-IR-NEXT: s_cselect_b32 s5, s6, s5 +; GCN-IR-NEXT: s_sub_i32 s6, s5, s4 +; GCN-IR-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-IR-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s4 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, @@ -607,8 +605,8 @@ define amdgpu_kernel 
void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem24_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_lshr_b32 s4, s4, 8 @@ -632,8 +630,8 @@ define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_urem24_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_lshr_b32 s4, s4, 8 @@ -664,112 +662,110 @@ define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem23_64_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_lshr_b32 s0, s9, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: s_sub_i32 s1, 0, s0 -; GCN-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-NEXT: s_lshr_b32 s8, s7, 9 +; GCN-NEXT: s_lshr_b32 s2, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_sub_i32 s3, 0, s2 +; GCN-NEXT: s_lshr_b32 s4, s11, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-NEXT: s_lshr_b32 s1, 
s11, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-NEXT: s_lshr_b32 s3, s5, 1 +; GCN-NEXT: s_lshr_b32 s5, s7, 9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_mul_i32 s5, s5, s0 -; GCN-NEXT: s_sub_i32 s4, s4, s5 -; GCN-NEXT: s_sub_i32 s5, s4, s0 -; GCN-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-NEXT: s_sub_i32 s5, s4, s0 -; GCN-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-NEXT: s_cselect_b32 s0, s5, s4 -; GCN-NEXT: s_sub_i32 s4, 0, s1 -; GCN-NEXT: v_mul_lo_u32 v0, s4, v1 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: s_mul_i32 s6, s6, s2 +; GCN-NEXT: s_sub_i32 s3, s3, s6 +; GCN-NEXT: s_sub_i32 s6, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s3, s6, s3 +; GCN-NEXT: s_sub_i32 s6, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s6, s6, s3 +; GCN-NEXT: s_sub_i32 s2, 0, s4 +; GCN-NEXT: v_mul_lo_u32 v0, s2, v1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mul_hi_u32 v2, s5, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-NEXT: s_mul_i32 s0, s0, s1 -; GCN-NEXT: s_sub_i32 s0, s8, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s1 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s1 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NEXT: s_mul_i32 s6, s6, s4 +; GCN-NEXT: s_sub_i32 s5, s5, s6 +; GCN-NEXT: s_sub_i32 s6, s5, s4 +; GCN-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-NEXT: s_cselect_b32 s5, s6, s5 +; GCN-NEXT: s_sub_i32 s6, s5, s4 +; GCN-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem23_64_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_lshr_b32 s0, s9, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: s_sub_i32 s1, 0, s0 -; GCN-IR-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-IR-NEXT: s_lshr_b32 s8, s7, 9 +; GCN-IR-NEXT: s_lshr_b32 s2, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: s_sub_i32 s3, 0, s2 +; GCN-IR-NEXT: s_lshr_b32 s4, s11, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s11, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-IR-NEXT: s_lshr_b32 s3, s5, 1 +; GCN-IR-NEXT: s_lshr_b32 s5, s7, 9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-IR-NEXT: s_mul_i32 s5, s5, s0 -; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 -; 
GCN-IR-NEXT: s_sub_i32 s5, s4, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-IR-NEXT: s_cselect_b32 s0, s5, s4 -; GCN-IR-NEXT: s_sub_i32 s4, 0, s1 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v1 -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-IR-NEXT: s_mul_i32 s6, s6, s2 +; GCN-IR-NEXT: s_sub_i32 s3, s3, s6 +; GCN-IR-NEXT: s_sub_i32 s6, s3, s2 +; GCN-IR-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-IR-NEXT: s_cselect_b32 s3, s6, s3 +; GCN-IR-NEXT: s_sub_i32 s6, s3, s2 +; GCN-IR-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-IR-NEXT: s_cselect_b32 s6, s6, s3 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s4 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v1 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s5, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-IR-NEXT: s_mul_i32 s0, s0, s1 -; GCN-IR-NEXT: s_sub_i32 s0, s8, s0 -; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-IR-NEXT: s_mul_i32 s6, s6, s4 +; GCN-IR-NEXT: s_sub_i32 s5, s5, s6 +; GCN-IR-NEXT: s_sub_i32 s6, s5, s4 +; GCN-IR-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-IR-NEXT: s_cselect_b32 s5, s6, s5 +; GCN-IR-NEXT: s_sub_i32 s6, s5, s4 +; GCN-IR-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-IR-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-IR-NEXT: 
v_mov_b32_e32 v2, s4 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, @@ -781,7 +777,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -885,7 +881,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_urem_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] @@ -965,6 +961,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem_k_den_i64: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_add_u32 s0, 0, 0xaaaa0000 ; GCN-NEXT: v_not_b32_e32 v0, 23 ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -980,7 +977,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v1, s0, v1 ; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 ; GCN-NEXT: s_mul_i32 s8, s1, s8 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_mul_hi_u32 v4, s1, v0 @@ -1003,8 +1000,8 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 ; GCN-NEXT: v_mul_hi_u32 v0, 
s7, v0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -1013,7 +1010,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v1, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v2, v0, 24 ; GCN-NEXT: v_mul_lo_u32 v0, v0, 24 -; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_mov_b32_e32 v2, s7 @@ -1042,7 +1038,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_urem_k_den_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] ; GCN-IR-NEXT: s_sub_u32 s8, 59, s12 @@ -1393,7 +1389,7 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s5, 0x41c00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s2, -1 @@ -1416,7 +1412,7 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_urem24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s5, 0x41c00000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s2, -1 @@ -1445,7 +1441,7 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_urem24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem24_k_den_i64: ; GCN: ; 
%bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_movk_i32 s4, 0x5b7f ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -1470,7 +1466,7 @@ define amdgpu_kernel void @s_test_urem24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_urem24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_movk_i32 s4, 0x5b7f ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index dacc9862059831..666ae7c126ae3e 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 { ; SI-LABEL: s_usubo_i64_zext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,8 +28,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; ; VI-LABEL: s_usubo_i64_zext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_sub_u32 s0, s6, s0 @@ -47,14 +47,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; ; GFX9-LABEL: s_usubo_i64_zext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; 
GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_sub_u32 s0, s6, s0 +; GFX9-NEXT: s_sub_u32 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_subb_u32 s1, s7, s1 +; GFX9-NEXT: s_subb_u32 s1, s7, s3 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -75,8 +75,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_usubo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -95,8 +95,8 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_usubo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -111,12 +111,12 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_usubo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; 
GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: global_store_byte v0, v2, s[6:7] @@ -132,7 +132,7 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_usubo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -182,7 +182,7 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -210,7 +210,7 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i32_novcc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -243,7 +243,7 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_usubo_i32_novcc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -268,7 +268,7 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_usubo_i32_novcc: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -301,7 +301,7 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; SI-LABEL: s_usubo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -325,7 +325,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_usubo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, s4, s6 @@ -345,7 +345,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_usubo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_u32 s6, s4, s6 @@ -370,7 +370,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: 
s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -401,7 +401,7 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_usubo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -454,7 +454,7 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -486,7 +486,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_usubo_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -508,7 +508,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] @@ -537,7 +537,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, 
ptr addrspace(1) % define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_usubo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -568,7 +568,7 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_usubo_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -591,7 +591,7 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_usubo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -618,45 +618,45 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_usubo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: s_cmp_eq_u32 s0, s1 -; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; SI-NEXT: s_cmp_eq_u32 s2, s3 +; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 +; SI-NEXT: s_xor_b64 s[2:3], vcc, -1 ; SI-NEXT: .LBB8_2: ; %exit -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; 
SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_usubo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: s_cmp_eq_u32 s0, s1 -; VI-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_cmp_eq_u32 s2, s3 +; VI-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_cbranch_scc1 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[0:1], vcc, -1 +; VI-NEXT: s_xor_b64 s[2:3], vcc, -1 ; VI-NEXT: .LBB8_2: ; %exit -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 @@ -668,19 +668,19 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: s_usubo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_cmp_eq_u32 s0, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_cmp_eq_u32 s2, s3 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1 ; GFX9-NEXT: .LBB8_2: ; %exit -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: global_store_byte v1, v2, s[6:7] @@ -707,7 +707,7 @@ exit: define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s14, s2 @@ -741,7 +741,7 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_usubo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -768,7 +768,7 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_usubo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: 
s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll index 2210b6c0d3c3a4..ca4d689156b491 100644 --- a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll +++ b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll @@ -25,7 +25,7 @@ bb: define amdgpu_kernel void @test_add_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { ; GFX9-LABEL: test_add_co_sdwa: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll index 4b9b5f9ffdf84f..2fa9750653b6d2 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll @@ -23,7 +23,7 @@ entry: define amdgpu_kernel void @fcmp_test(half %x, half %y) { ; CHECK-LABEL: fcmp_test: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_b32 s0, s[2:3], 0x0 +; CHECK-NEXT: s_load_b32 s0, s[0:1], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s1, s0, 16 @@ -46,7 +46,7 @@ entry: define amdgpu_kernel void @ballot_test(half %x, half %y) { ; CHECK-LABEL: ballot_test: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_b32 s0, s[2:3], 0x0 +; CHECK-NEXT: s_load_b32 s0, s[0:1], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s1, s0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index a8f3635416cffa..fc6df735c05b0f 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -13,37 +13,37 @@ declare double @llvm.fabs.f64(double) define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: 
v_cnd_nan_nosgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_cmp_eq_u32 s8, 0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cnd_nan_nosgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s4, 0 +; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc @@ -54,37 +54,35 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; ; GFX10-LABEL: v_cnd_nan_nosgpr: ; GFX10: 
; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[0:1] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cnd_nan_nosgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -109,7 +107,7 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 { ; SI-LABEL: v_cnd_nan: ; 
SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -124,7 +122,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; VI-LABEL: v_cnd_nan: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -137,7 +135,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; GFX10-LABEL: v_cnd_nan: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s2, 0 @@ -148,7 +146,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; GFX11-LABEL: v_cnd_nan: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_eq_u32 s2, 0 @@ -171,30 +169,30 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 
+; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -202,27 +200,26 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[2:3] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[0:1] ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5] -; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -238,30 +235,30 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 
2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -269,25 +266,24 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -305,30 +301,30 @@ 
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -336,27 +332,26 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[2:3] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[0:1] ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5] -; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -372,30 +367,30 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; 
SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -403,25 +398,24 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: 
s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -439,16 +433,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -457,20 +451,20 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -478,34 +472,32 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -526,16 +518,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -544,20 +536,20 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -565,34 +557,32 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -613,8 +603,8 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -632,8 +622,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -652,10 +642,9 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -666,12 +655,9 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -695,8 +681,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -716,8 +702,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -741,13 +727,13 @@ 
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc @@ -757,10 +743,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -789,8 +773,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -810,8 +794,8 @@ 
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -835,13 +819,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc @@ -851,10 +835,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -883,8 +865,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o define amdgpu_kernel void 
@icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -905,8 +887,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -931,13 +913,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc @@ -948,10 +930,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] glc dlc @@ -981,8 +961,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1007,8 +987,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1036,14 +1016,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] 
glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1056,10 +1036,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1092,8 +1070,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1118,8 +1096,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1147,14 +1125,14 @@ define 
amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1167,10 +1145,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1205,8 +1181,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 2, v0 @@ -1231,8 +1207,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1260,14 +1236,14 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1280,10 +1256,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1316,8 +1290,8 @@ define 
amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1341,8 +1315,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -1369,13 +1343,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v2, v1, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 @@ -1388,10 +1362,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 
0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] glc dlc @@ -1426,8 +1398,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1451,8 +1423,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1479,14 +1451,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc @@ -1497,10 +1469,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1532,8 +1502,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1556,8 +1526,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1583,14 +1553,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1601,10 +1571,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1636,8 +1604,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; 
SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1657,8 +1625,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; ; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1682,13 +1650,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc @@ -1698,10 +1666,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1731,8 
+1697,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1756,8 +1722,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1785,13 +1751,13 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc @@ -1805,10 +1771,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr 
add ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1844,18 +1808,18 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 @@ -1863,22 +1827,22 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f16: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 @@ -1891,15 +1855,15 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v0, v0, s[0:1] +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1907,23 +1871,21 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX10-NEXT: global_store_short v2, v0, s[0:1] +; GFX10-NEXT: global_store_short v2, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: 
v_cndmask_abs_neg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0 @@ -1948,37 +1910,37 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_cmp_lg_u32 s8, 0 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3] @@ -1989,37 +1951,35 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[0:1] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cndmask_abs_neg_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] @@ -2041,18 +2001,18 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 
s10, -1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 ; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 @@ -2060,22 +2020,22 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 @@ -2089,15 +2049,15 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2106,23 +2066,21 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cndmask_abs_neg_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll index 472a443cf6dde7..f7933d719f9893 100644 --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -6,57 +6,57 @@ define amdgpu_kernel void @madak_f16( ; SI-LABEL: madak_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_madak_f32 v0, v0, v1, 0x41200000 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: madak_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_ushort v1, off, 
s[0:3], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_madak_f16 v0, v0, v1, 0x4900 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: madak_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -95,32 +95,32 @@ entry: define amdgpu_kernel void @madak_f16_use_2( ; SI-LABEL: madak_f16_use_2: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s18, s14 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s8 ; SI-NEXT: s_mov_b32 s17, s9 -; SI-NEXT: s_mov_b32 s19, s15 +; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc ; SI-NEXT: 
s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, 0x41200000 -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -128,49 +128,49 @@ define amdgpu_kernel void @madak_f16_use_2( ; SI-NEXT: v_mac_f32_e32 v3, v0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: buffer_store_short v0, off, s[12:15], 0 -; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: madak_f16_use_2: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s15, 0xf000 -; VI-NEXT: s_mov_b32 s14, -1 -; VI-NEXT: s_mov_b32 s18, s14 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s8 ; VI-NEXT: s_mov_b32 s17, s9 -; VI-NEXT: s_mov_b32 s19, s15 +; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s14 -; VI-NEXT: s_mov_b32 s11, s15 -; VI-NEXT: s_mov_b32 s2, s14 -; VI-NEXT: s_mov_b32 s3, s15 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; 
VI-NEXT: v_mov_b32_e32 v3, 0x4900 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900 ; VI-NEXT: v_mac_f16_e32 v3, v0, v2 -; VI-NEXT: buffer_store_short v1, off, s[12:15], 0 -; VI-NEXT: buffer_store_short v3, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v3, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: madak_f16_use_2: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX11-NEXT: s_mov_b32 s14, -1 ; GFX11-NEXT: s_mov_b32 s15, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s14 diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 9f6d27802e1843..8bc8fbd0e0e846 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -24,7 +24,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace ; ; GISEL-LABEL: v_pack_b32_v2f16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -56,7 +56,7 @@ 
define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32_v2f16_sub: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -73,7 +73,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs ; ; GISEL-LABEL: v_pack_b32_v2f16_sub: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -105,7 +105,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs define amdgpu_kernel void @fptrunc( ; GCN-LABEL: fptrunc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s7, 0x31016000 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -125,7 +125,7 @@ define amdgpu_kernel void @fptrunc( ; ; GISEL-LABEL: fptrunc: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -147,7 +147,7 @@ define amdgpu_kernel void @fptrunc( define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32.fabs: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -164,7 +164,7 @@ define amdgpu_kernel 
void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( ; ; GISEL-LABEL: v_pack_b32.fabs: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32.fneg: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -215,7 +215,7 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace( ; ; GISEL-LABEL: v_pack_b32.fneg: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 8579cbdf47137d..89fef7eead839a 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -89,7 +89,7 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) { define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg %src0ext, i32 inreg %src1ext) { ; SDAG-VI-LABEL: basic_smax_smin_sgpr: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 @@ -104,7 +104,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr 
addrspace(1) %out, i32 inreg ; ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -117,7 +117,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_med3_i16 v0, s2, 0, 0xff @@ -132,7 +132,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-VI-LABEL: basic_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-VI-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -156,7 +156,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -176,7 +176,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -413,13 +413,13 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) { define 
amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> inreg %src) { ; SDAG-VI-LABEL: vec_smax_smin_sgpr: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: s_lshr_b32 s2, s4, 16 -; SDAG-VI-NEXT: v_max_i16_e64 v1, s4, 0 -; SDAG-VI-NEXT: v_max_i16_e64 v2, s2, 0 +; SDAG-VI-NEXT: s_lshr_b32 s3, s2, 16 +; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 +; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0 ; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 ; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0 @@ -430,24 +430,24 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; SDAG-GFX9-LABEL: vec_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; SDAG-GFX9-NEXT: s_movk_i32 s2, 0xff +; SDAG-GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX9-NEXT: s_movk_i32 s0, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX9-NEXT: v_pk_max_i16 v1, s4, 0 -; SDAG-GFX9-NEXT: v_pk_min_i16 v1, v1, s2 op_sel_hi:[1,0] -; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX9-NEXT: v_pk_min_i16 v1, v1, s0 op_sel_hi:[1,0] +; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX9-NEXT: s_endpgm ; ; SDAG-GFX11-LABEL: vec_smax_smin_sgpr: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: 
v_mov_b32_e32 v1, 0 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s4, 0 +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s2, 0 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] ; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -457,24 +457,24 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; GISEL-VI-LABEL: vec_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GISEL-VI-NEXT: s_sext_i32_i16 s2, 0 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_sext_i32_i16 s3, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: s_lshr_b32 s3, s4, 16 +; GISEL-VI-NEXT: s_lshr_b32 s4, s2, 16 +; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, s4 -; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-VI-NEXT: s_max_i32 s4, s4, s2 -; GISEL-VI-NEXT: s_max_i32 s2, s3, s2 -; GISEL-VI-NEXT: s_sext_i32_i16 s3, s4 +; GISEL-VI-NEXT: s_max_i32 s2, s2, s3 +; GISEL-VI-NEXT: s_max_i32 s3, s4, s3 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0xff +; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 ; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-VI-NEXT: s_min_i32 s2, s2, s4 ; GISEL-VI-NEXT: s_min_i32 s3, s3, s4 -; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 +; GISEL-VI-NEXT: s_min_i32 s2, s2, s4 ; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3 -; GISEL-VI-NEXT: s_lshl_b32 s2, s2, 16 -; GISEL-VI-NEXT: s_or_b32 s2, s3, s2 +; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 +; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16 +; GISEL-VI-NEXT: s_or_b32 s2, s2, s3 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -483,40 +483,40 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; GISEL-GFX9-LABEL: vec_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; 
GISEL-GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, 0 +; GISEL-GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, 0 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s4 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, s4 ; GISEL-GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GISEL-GFX9-NEXT: s_max_i32 s2, s3, s2 -; GISEL-GFX9-NEXT: s_max_i32 s3, s4, 0 -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s2 -; GISEL-GFX9-NEXT: s_ashr_i32 s2, s2, 16 +; GISEL-GFX9-NEXT: s_max_i32 s0, s1, s0 +; GISEL-GFX9-NEXT: s_max_i32 s1, s4, 0 +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, s0 +; GISEL-GFX9-NEXT: s_ashr_i32 s0, s0, 16 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0xff00ff -; GISEL-GFX9-NEXT: s_min_i32 s3, s3, s4 -; GISEL-GFX9-NEXT: s_min_i32 s2, s2, 0xff -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX9-NEXT: s_min_i32 s1, s1, s4 +; GISEL-GFX9-NEXT: s_min_i32 s0, s0, 0xff +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX9-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: vec_smax_smin_sgpr: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GISEL-GFX11-NEXT: s_sext_i32_i16 s2, 0 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, s4 -; GISEL-GFX11-NEXT: s_ashr_i32 
s4, s4, 16 -; GISEL-GFX11-NEXT: s_max_i32 s2, s3, s2 -; GISEL-GFX11-NEXT: s_max_i32 s3, s4, 0 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2 +; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16 +; GISEL-GFX11-NEXT: s_max_i32 s3, s4, s3 +; GISEL-GFX11-NEXT: s_max_i32 s2, s2, 0 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0xff00ff ; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2 ; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16 diff --git a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll index 02a6024f858e9f..d5347f829002db 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll @@ -25,7 +25,7 @@ bb: define amdgpu_kernel void @test_sub_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { ; GFX9-LABEL: test_sub_co_sdwa: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll index f0cbeba1cfb743..eb88c790dfe729 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll @@ -10,8 +10,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx) #1 { ; GCN-LABEL: extract_insert_same_dynelt_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: 
s_mov_b32 s6, 0 @@ -34,27 +34,30 @@ define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(ptr addrspace(1) %ou define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx0, i32 %idx1) #1 { ; GCN-LABEL: extract_insert_different_dynelt_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[0:1], s[10:11] -; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64 +; GCN-NEXT: s_mov_b64 s[4:5], s[10:11] +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[4:7], 0 addr64 +; GCN-NEXT: s_load_dword s14, s[0:1], 0xf ; GCN-NEXT: s_cmp_eq_u32 s13, 3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s13, 2 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s13, 1 -; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s13, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s14, 1 ; GCN-NEXT: v_mov_b32_e32 v7, v5 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] @@ -84,8 +87,8 @@ define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(ptr addrspace(1 define amdgpu_kernel void @extract_insert_same_elt2_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx) #1 { ; GCN-LABEL: extract_insert_same_elt2_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 
-; GCN-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 @@ -108,19 +111,19 @@ define amdgpu_kernel void @extract_insert_same_elt2_v4i32(ptr addrspace(1) %out, define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in, float %val, i32 %idx) #1 { ; GCN-LABEL: extract_insert_same_dynelt_v4f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xd -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] +; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: buffer_store_dword v0, v[4:5], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 2797c5b7988810..66c49ba8b734db 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1585,47 +1585,45 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, 
ptr addrspace(1) nocapture %C) { ; GFX9-LABEL: fma_shuffle_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[4:5] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fma_shuffle_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[4:5] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] ; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] ; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] -; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: 
fma_shuffle_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 @@ -1715,7 +1713,7 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) % define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ptr addrspace(1) %out) { ; GFX9-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 @@ -1729,7 +1727,7 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ; ; GFX10-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 @@ -1743,7 +1741,7 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ; ; GFX11-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -4239,8 +4237,8 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) { ; GFX9-LABEL: fma_shuffle_v2bf16: 
; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: s_mov_b32 s3, 0x7060302 @@ -4323,8 +4321,8 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-LABEL: fma_shuffle_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 @@ -4406,14 +4404,12 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-LABEL: fma_shuffle_v2bf16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_load_b64 v[0:1], v6, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] ; GFX11-NEXT: global_load_b64 v[2:3], v6, s[4:5] ; GFX11-NEXT: global_load_b64 v[4:5], v6, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4424,43 +4420,43 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_fmac_f32 v1, v12, v4 :: v_dual_lshlrev_b32 v8, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_fmac_f32 v11, v12, v9 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_fmac_f32_e32 v1, v12, v4 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 ; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9 ; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_add3_u32 v15, v15, v1, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX11-NEXT: v_dual_fmac_f32 v7, v8, v9 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4 ; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add3_u32 v4, v4, v7, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: 
v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_fmac_f32_e32 v4, v2, v5 -; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v9 -; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v15, v16, vcc_lo +; GFX11-NEXT: v_dual_fmac_f32 v4, v2, v5 :: v_dual_cndmask_b32 v1, v15, v16 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4495,7 +4491,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo ; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 -; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll 
b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 02da6deb96f1fe..340f0cdd5d5d07 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -462,9 +462,9 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-LABEL: name: livevariables_update_missed_block ; SI: bb.0.entry: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) - ; SI-NEXT: liveins: $vgpr0, $sgpr2_sgpr3 + ; SI-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr2_sgpr3 + ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr0_sgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 ; SI-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY1]](s32), implicit $exec ; SI-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -474,7 +474,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4) - ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %54, 0, implicit $exec + ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %48, 0, implicit $exec ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], 
%subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1 ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) @@ -502,14 +502,14 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: bb.5.Flow: ; SI-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %55:vgpr_32, %bb.6 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %49:vgpr_32, %bb.6 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %41:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %35:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5 @@ -562,9 +562,9 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-LABEL: name: nested_waterfalls ; SI: bb.0.entry: ; SI-NEXT: successors: %bb.1(0x80000000) - ; SI-NEXT: liveins: $vgpr0, $sgpr2_sgpr3 + ; SI-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr2_sgpr3 + ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr0_sgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.1.if.then: @@ -635,7 +635,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: bb.5: ; SI-NEXT: successors: 
%bb.4(0x40000000), %bb.6(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %28:vreg_64, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + ; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %22:vreg_64, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc ; SI-NEXT: SI_WATERFALL_LOOP %bb.4, implicit $exec ; SI-NEXT: {{ $}} @@ -648,7 +648,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: {{ $}} ; SI-NEXT: bb.7: ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] - ; SI-NEXT: GLOBAL_STORE_DWORD undef %31:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; SI-NEXT: GLOBAL_STORE_DWORD undef %25:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; SI-NEXT: S_ENDPGM 0 entry: %0 = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index 6410df7f69e2ac..1937c573820927 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v3i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 8 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 @@ -18,7 +18,7 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX906-NEXT: v_and_or_b32 v4, v4, s4, v5 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v0, v2, s[6:7] @@ -28,9 +28,9 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX906-NEXT: v_and_or_b32 v4, v0, s4, v2 ; GFX906-NEXT: .LBB0_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[0:1] offset:2 -; GFX906-NEXT: global_store_short v1, v4, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[2:3] offset:2 +; GFX906-NEXT: global_store_short v1, v4, s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -52,21 +52,21 @@ bb.2: define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v4i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dword v2, v3, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; 
%bb.1 ; GFX906-NEXT: global_load_dword v2, v3, s[6:7] ; GFX906-NEXT: .LBB1_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v1, v2, s[0:1] +; GFX906-NEXT: global_store_dword v1, v2, s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -88,8 +88,8 @@ bb.2: define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v5i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 @@ -97,16 +97,16 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX906-NEXT: .LBB2_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: global_store_byte v3, v2, s[0:1] offset:4 -; GFX906-NEXT: global_store_dword v3, v1, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: global_store_byte v3, v2, s[2:3] offset:4 +; GFX906-NEXT: global_store_dword v3, v1, s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -128,21 +128,21 @@ bb.2: define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture 
%dst) { ; GFX906-LABEL: v8i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7] ; GFX906-NEXT: .LBB3_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -164,21 +164,21 @@ bb.2: define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v16i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[6:7] ; GFX906-NEXT: .LBB4_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 
exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -200,25 +200,25 @@ bb.2: define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v10, 5, v0 ; GFX906-NEXT: v_mov_b32_e32 v9, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[4:5] offset:16 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[6:7] offset:16 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[6:7] ; GFX906-NEXT: .LBB5_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[0:1] offset:16 +; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[2:3] offset:16 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -240,25 +240,25 @@ bb.2: define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v256i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v61, 3, v0 -; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_mov_b32 s10, -1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:240 -; GFX906-NEXT: s_mov_b32 s14, -1 -; GFX906-NEXT: s_mov_b32 s15, 0xe00000 -; GFX906-NEXT: s_add_u32 s12, s12, s9 -; GFX906-NEXT: s_addc_u32 s13, s13, 0 +; GFX906-NEXT: s_mov_b32 s11, 0xe00000 +; GFX906-NEXT: s_add_u32 s8, s8, s3 +; GFX906-NEXT: s_addc_u32 s9, s9, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:224 ; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[4:5] offset:208 @@ -280,11 +280,11 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] offset:240 ; GFX906-NEXT: 
s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[6:7] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[6:7] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[6:7] offset:192 @@ -302,7 +302,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] ; GFX906-NEXT: .LBB6_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: s_waitcnt vmcnt(7) +; GFX906-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:112 ; GFX906-NEXT: s_waitcnt vmcnt(7) ; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[0:1] offset:96 @@ -318,11 +318,11 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[0:1] offset:16 ; GFX906-NEXT: s_waitcnt vmcnt(7) ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 
; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:240 ; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[0:1] offset:224 @@ -353,9 +353,9 @@ bb.2: define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: repeat_successor: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s8, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX906-NEXT: s_load_dword s8, s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_cmp_lt_i32 s8, 3 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_3 @@ -375,7 +375,7 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr ; GFX906-NEXT: .LBB7_5: ; %return.sink.split ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v1, v0, s[0:1] +; GFX906-NEXT: global_store_dword v1, v0, s[2:3] ; GFX906-NEXT: .LBB7_6: ; %return ; GFX906-NEXT: s_endpgm entry: @@ -405,7 +405,7 @@ return: define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_chain: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; 
GFX906-NEXT: s_waitcnt lgkmcnt(0) @@ -460,7 +460,7 @@ bb.3: define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_zeroinit: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: ; implicit-def: $vgpr1_vgpr2 @@ -522,7 +522,7 @@ bb.3: define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_const: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: ; implicit-def: $vgpr3 @@ -631,7 +631,7 @@ bb.3: define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_multi_block: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 @@ -682,25 +682,25 @@ bb.3: define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_loop_carried: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX906-NEXT: v_cmp_lt_u32_e32 vcc, 14, v0 ; GFX906-NEXT: s_mov_b32 s4, 0x2000604 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v1, 
v1, s[0:1] -; GFX906-NEXT: s_mov_b64 s[0:1], 0 +; GFX906-NEXT: global_load_dword v1, v1, s[2:3] +; GFX906-NEXT: s_mov_b64 s[2:3], 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v0, v1 ; GFX906-NEXT: .LBB12_1: ; %bb.1 ; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX906-NEXT: s_and_b64 s[6:7], exec, vcc -; GFX906-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; GFX906-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] ; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_cbranch_execnz .LBB12_1 ; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_store_dword v1, v0, s[0:1] @@ -728,9 +728,9 @@ bb.2: define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) { ; GFX906-LABEL: v8i8_multiuse_multiblock: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) @@ -874,13 +874,13 @@ bb.3: define amdgpu_kernel void @MissingInc_PhiChain(i1 %cmp, <16 x i8> %input) { ; GFX906-LABEL: MissingInc_PhiChain: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX906-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 
; GFX906-NEXT: s_mov_b32 s10, 1 ; GFX906-NEXT: v_mov_b32_e32 v4, 1 ; GFX906-NEXT: s_mov_b32 s11, 1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_bitcmp1_b32 s0, 0 +; GFX906-NEXT: s_bitcmp1_b32 s2, 0 ; GFX906-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX906-NEXT: s_xor_b64 s[0:1], s[2:3], -1 ; GFX906-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll index d7db68a433319c..1afe5cdea87233 100644 --- a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll @@ -18,29 +18,29 @@ declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 i define amdgpu_kernel void @foo(i1 %cmp1) { ; GFX906-LABEL: foo: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX906-NEXT: s_mov_b32 s14, -1 -; GFX906-NEXT: s_mov_b32 s15, 0xe00000 -; GFX906-NEXT: s_add_u32 s12, s12, s9 -; GFX906-NEXT: s_addc_u32 s13, s13, 0 -; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 -; GFX906-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:4 -; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:8 -; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:12 -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x1c -; GFX906-NEXT: s_mov_b64 s[2:3], exec +; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_mov_b32 s10, -1 +; GFX906-NEXT: s_mov_b32 s11, 0xe00000 +; GFX906-NEXT: s_add_u32 s8, s8, s3 +; GFX906-NEXT: s_addc_u32 s9, s9, 0 +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:4 +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:8 +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:12 +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX906-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x1c ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_bitcmp1_b32 s4, 0 -; GFX906-NEXT: s_mul_i32 s0, s0, s1 -; GFX906-NEXT: v_mul_u32_u24_e32 v1, s1, v1 +; GFX906-NEXT: s_mul_i32 s0, s2, s3 +; GFX906-NEXT: v_mul_u32_u24_e32 v1, s3, v1 ; GFX906-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX906-NEXT: v_add_lshl_u32 v2, v0, v2, 4 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_mov_b32 s4, 0 ; GFX906-NEXT: v_mov_b32_e32 v1, v0 ; GFX906-NEXT: s_cselect_b32 s5, 1, 0 +; GFX906-NEXT: s_mov_b64 s[2:3], exec ; GFX906-NEXT: ds_write_b64 v2, v[0:1] ; GFX906-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX906-NEXT: s_waitcnt vmcnt(3) diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index dae46361b9bcca..901e88a4c6aca8 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] @@ -20,7 +20,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_f32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] @@ -53,7 +53,7 @@ define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_f32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] @@ -101,7 +101,7 @@ define amdgpu_ps void @test_vopc_vcmp(float %x) { define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_2xf16: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_2xf16: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -138,25 +138,25 @@ define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 { ; GFX1032-LABEL: test_vopc_class: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_class_f32_e64 s2, s4, 0x204 -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032-NEXT: v_cmp_class_f32_e64 s0, s4, 0x204 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] ; 
GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vopc_class: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 0x204 -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 0x204 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1064-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 @@ -169,27 +169,27 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 ; GFX1032-LABEL: test_vcmp_vcnd_f16: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, s4 ; GFX1032-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc_lo -; GFX1032-NEXT: global_store_short v1, v0, s[0:1] +; GFX1032-NEXT: global_store_short v1, v0, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vcmp_vcnd_f16: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: v_cmp_neq_f16_e64 vcc, 0x7c00, 
s4 ; GFX1064-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc -; GFX1064-NEXT: global_store_short v1, v0, s[0:1] +; GFX1064-NEXT: global_store_short v1, v0, s[2:3] ; GFX1064-NEXT: s_endpgm %cmp = fcmp oeq half %x, 0x7FF0000000000000 %sel = select i1 %cmp, half 1.0, half %x @@ -200,7 +200,7 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -214,7 +214,7 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] @@ -239,7 +239,7 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -253,7 +253,7 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: 
global_load_dword v1, v0, s[2:3] @@ -278,7 +278,7 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -292,7 +292,7 @@ define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] @@ -318,10 +318,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_mask_if: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_cmp_lt_u32_e32 vcc_lo, 10, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: ; %if -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_store_dword v0, v0, s[0:1] @@ -331,10 +331,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1064-LABEL: test_mask_if: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_cmp_lt_u32_e32 vcc, 10, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; %if -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_store_dword v0, v0, s[0:1] @@ -355,7 +355,7 @@ endif: define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_loop_with_if: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -417,7 +417,7 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; ; GFX1064-LABEL: test_loop_with_if: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -516,42 +516,42 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032-LABEL: test_loop_with_if_else_break: ; GFX1032: ; %bb.0: ; %bb ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: ; %bb.1: ; %.preheader -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: ; implicit-def: $sgpr3 +; GFX1032-NEXT: s_mov_b32 s3, 0 +; GFX1032-NEXT: ; implicit-def: $sgpr4 ; GFX1032-NEXT: s_branch .LBB11_4 ; GFX1032-NEXT: .LBB11_2: ; %bb8 ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_add_i32 s2, s2, 1 +; GFX1032-NEXT: s_add_i32 s3, s3, 1 ; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] -; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s2, v1 +; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s3, v1 ; GFX1032-NEXT: s_add_u32 s0, s0, 
4 ; GFX1032-NEXT: s_addc_u32 s1, s1, 0 -; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo ; GFX1032-NEXT: s_and_b32 s5, vcc_lo, exec_lo -; GFX1032-NEXT: s_or_b32 s3, s3, s5 +; GFX1032-NEXT: s_or_b32 s4, s4, s5 ; GFX1032-NEXT: .LBB11_3: ; %Flow ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_and_b32 s5, exec_lo, s3 -; GFX1032-NEXT: s_or_b32 s4, s5, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, s5, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: .LBB11_4: ; %bb2 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v3, v2, s[0:1] -; GFX1032-NEXT: s_or_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_or_b32 s4, s4, exec_lo ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v3 ; GFX1032-NEXT: s_cbranch_vccz .LBB11_2 ; GFX1032-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: ; implicit-def: $sgpr2 +; GFX1032-NEXT: ; implicit-def: $sgpr3 ; GFX1032-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX1032-NEXT: s_branch .LBB11_3 ; GFX1032-NEXT: .LBB11_6: ; %.loopexit @@ -561,10 +561,10 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1064: ; %bb.0: ; %bb ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_mov_b32 s6, 0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_6 ; GFX1064-NEXT: ; %bb.1: ; %.preheader -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 @@ -631,7 +631,7 @@ bb8: define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: 
test_addc_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -643,7 +643,7 @@ define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 ; ; GFX1064-LABEL: test_addc_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -664,7 +664,7 @@ bb: define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subbrev_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -676,7 +676,7 @@ define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) ; ; GFX1064-LABEL: test_subbrev_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -697,7 +697,7 @@ bb: define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subb_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -709,7 +709,7 @@ define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) 
%arg, i64 %arg1) #0 ; ; GFX1064-LABEL: test_subb_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -730,7 +730,7 @@ bb: define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_udiv64: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -892,7 +892,7 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; ; GFX1064-LABEL: test_udiv64: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1063,7 +1063,7 @@ bb: define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_div_scale_f32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1077,7 +1077,7 @@ define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX1064-LABEL: test_div_scale_f32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1104,33 +1104,31 @@ define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, 
ptr addrspa define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_div_scale_f64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc +; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[2:3], v[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_scale_f64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc +; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[2:3], v[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; 
GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1064-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -1188,8 +1186,8 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1032-LABEL: test_div_fmas_f32: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s5 @@ -1197,14 +1195,14 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1032-NEXT: s_bitcmp1_b32 s7, 0 ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1032-NEXT: v_div_fmas_f32 v0, s4, v0, v1 -; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] +; GFX1032-NEXT: global_store_dword v2, v0, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_fmas_f32: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s5 @@ -1212,7 +1210,7 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1064-NEXT: s_bitcmp1_b32 s7, 0 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX1064-NEXT: v_div_fmas_f32 v0, s4, v0, v1 -; GFX1064-NEXT: global_store_dword v2, v0, s[0:1] +; GFX1064-NEXT: global_store_dword v2, v0, s[2:3] ; GFX1064-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 @@ -1223,14 +1221,14 @@ define 
amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX1032-LABEL: test_div_fmas_f64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX1032-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s8 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: v_mov_b32_e32 v2, s10 ; GFX1032-NEXT: v_mov_b32_e32 v3, s11 -; GFX1032-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 s2, 0 ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1032-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -1240,14 +1238,14 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX1064-LABEL: test_div_fmas_f64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX1064-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064-NEXT: v_mov_b32_e32 v1, s9 ; GFX1064-NEXT: v_mov_b32_e32 v2, s10 ; GFX1064-NEXT: v_mov_b32_e32 v3, s11 -; GFX1064-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 s2, 0 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX1064-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -1263,9 +1261,11 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 { ; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX1032-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1290,9 +1290,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1064-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 ; GFX1064-NEXT: s_mov_b64 vcc, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1343,7 +1344,7 @@ exit: define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX1032-LABEL: fdiv_f32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX1032-NEXT: v_rcp_f32_e32 v1, v0 @@ -1362,7 +1363,7 @@ define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) # ; ; GFX1064-LABEL: fdiv_f32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v0, s[4:5], s3, s3, s2 ; GFX1064-NEXT: v_rcp_f32_e32 v1, v0 @@ -1388,13 +1389,13 @@ define amdgpu_kernel void @test_br_cc_f16( ; GFX1032-LABEL: test_br_cc_f16: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX1032-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX1032-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v2 ; GFX1032-NEXT: s_cbranch_vccnz .LBB24_2 @@ -1408,13 +1409,13 @@ define amdgpu_kernel void @test_br_cc_f16( ; GFX1064-LABEL: test_br_cc_f16: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX1064-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX1064-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v2 ; GFX1064-NEXT: s_cbranch_vccnz .LBB24_2 @@ -1445,12 +1446,12 @@ two: define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %val) #0 { ; GCN-LABEL: test_brcc_i1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp0_b32 s0, 0 +; GCN-NEXT: s_bitcmp0_b32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB25_2 ; GCN-NEXT: ; %bb.1: ; %store -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0xde ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1472,14 +1473,14 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1032-LABEL: 
test_preserve_condition_undef_flag: ; GFX1032: ; %bb.0: ; %bb0 ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dword s1, s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_nlt_f32_e64 s2, s0, 1.0 -; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s1, 1.0 -; GFX1032-NEXT: v_cmp_ngt_f32_e64 s0, s0, 0 -; GFX1032-NEXT: s_or_b32 s1, s2, s1 -; GFX1032-NEXT: s_or_b32 s0, s1, s0 +; GFX1032-NEXT: v_cmp_nlt_f32_e64 s0, s2, 1.0 +; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s3, 1.0 +; GFX1032-NEXT: v_cmp_ngt_f32_e64 s2, s2, 0 +; GFX1032-NEXT: s_or_b32 s0, s0, s1 +; GFX1032-NEXT: s_or_b32 s0, s0, s2 ; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_vccnz .LBB26_2 ; GFX1032-NEXT: ; %bb.1: ; %bb1 @@ -1492,11 +1493,11 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1064-LABEL: test_preserve_condition_undef_flag: ; GFX1064: ; %bb.0: ; %bb0 ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dword s5, s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, 1.0 -; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s5, 1.0 +; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, 1.0 ; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[4:5], s4, 0 ; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] @@ -1530,7 +1531,7 @@ bb2: define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1032-LABEL: test_invert_true_phi_cond_break_loop: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX1032-NEXT: ; implicit-def: $sgpr1 ; GFX1032-NEXT: ; implicit-def: $sgpr2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ 
-1568,7 +1569,7 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; ; GFX1064-LABEL: test_invert_true_phi_cond_break_loop: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX1064-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX1064-NEXT: ; implicit-def: $sgpr4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1633,7 +1634,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) ; GFX1032-LABEL: test_movrels_extract_neg_offset_vgpr: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo @@ -1648,7 +1649,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) ; GFX1064-LABEL: test_movrels_extract_neg_offset_vgpr: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1671,29 +1672,29 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 ; GFX1032-LABEL: test_set_inactive: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v0, 42 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: 
global_store_dword v1, v0, s[0:1] +; GFX1032-NEXT: global_store_dword v1, v0, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_set_inactive: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v0, 42 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064-NEXT: global_store_dword v1, v0, s[2:3] ; GFX1064-NEXT: s_endpgm %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) store i32 %tmp, ptr addrspace(1) %out @@ -1703,7 +1704,7 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 { ; GFX1032-LABEL: test_set_inactive_64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s2 @@ -1717,7 +1718,7 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) ; ; GFX1064-LABEL: test_set_inactive_64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s2 @@ -2137,7 +2138,7 @@ main_body: define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -2147,7 +2148,7 @@ define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, ; ; GFX1064-LABEL: test_intr_fcmp_i64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| @@ -2165,26 +2166,26 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { ; GFX1032-LABEL: test_intr_icmp_i64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 -; GFX1032-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_icmp_i64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 -; GFX1064-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4 +; GFX1064-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s1 +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; 
GFX1064-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32) store i64 %result, ptr addrspace(1) %out @@ -2194,7 +2195,7 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -2204,7 +2205,7 @@ define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, ; ; GFX1064-LABEL: test_intr_fcmp_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| @@ -2221,25 +2222,25 @@ define amdgpu_kernel void @test_intr_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; GFX1032-LABEL: test_intr_icmp_i32: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_icmp_i32: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 
0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1064-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -2353,7 +2354,7 @@ define amdgpu_ps float @test_ps_live() #0 { define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2373,7 +2374,7 @@ define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, pt ; ; GFX1064-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2470,7 +2471,7 @@ main_body: define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-LABEL: icmp64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 @@ -2504,7 +2505,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; ; GFX1064-LABEL: icmp64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s0, 
s[0:1], 0x28 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 @@ -2565,7 +2566,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-LABEL: fcmp64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 @@ -2597,7 +2598,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; ; GFX1064-LABEL: fcmp64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 @@ -2657,7 +2658,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1032-LABEL: icmp32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 @@ -2691,7 +2692,7 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; ; GFX1064-LABEL: icmp32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 @@ -2751,7 +2752,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1032-LABEL: fcmp32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 @@ -2783,7 +2784,7 @@ define amdgpu_kernel 
void @fcmp32(float %n, float %s) { ; ; GFX1064-LABEL: fcmp32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index 978ac548443f73..e0b320aa4f3727 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -22,7 +22,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_i16_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -36,7 +36,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_i16_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -59,7 +59,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load_zext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -76,7 +76,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar ; ; VI-LABEL: widen_i16_constant_load_zext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -91,7 +91,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar ; ; GFX11-LABEL: widen_i16_constant_load_zext_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -117,7 +117,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load_sext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -134,7 +134,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar ; ; VI-LABEL: widen_i16_constant_load_sext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -149,7 +149,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar ; ; GFX11-LABEL: widen_i16_constant_load_sext_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -175,7 +175,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr 
addrspace(4) %ar define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i17_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -199,7 +199,7 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_i17_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, 2 @@ -218,7 +218,7 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_i17_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -247,7 +247,7 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_f16_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -263,7 +263,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_f16_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -275,7 +275,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_f16_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -296,7 +296,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_v2i8_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -317,7 +317,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_v2i8_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 44 ; VI-NEXT: v_mov_b32_e32 v1, 3 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -338,7 +338,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_v2i8_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -368,7 +368,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) %arg) { ; SI-LABEL: no_widen_i16_constant_divergent_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -387,7 +387,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) ; ; VI-LABEL: no_widen_i16_constant_divergent_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 
; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -404,9 +404,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) ; ; GFX11-LABEL: no_widen_i16_constant_divergent_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] @@ -433,7 +431,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i1_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -448,7 +446,7 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_i1_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -461,7 +459,7 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_i1_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -482,7 +480,7 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_zextload_i64_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -499,7 +497,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) ; ; VI-LABEL: widen_i16_zextload_i64_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -514,7 +512,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) ; ; GFX11-LABEL: widen_i16_zextload_i64_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -540,7 +538,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i1_zext_to_i64_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -558,7 +556,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % ; ; VI-LABEL: widen_i1_zext_to_i64_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -574,7 +572,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % ; ; GFX11-LABEL: widen_i1_zext_to_i64_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ 
-598,7 +596,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { ; SI-LABEL: widen_i16_constant32_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -614,7 +612,7 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { ; ; VI-LABEL: widen_i16_constant32_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -629,7 +627,7 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { ; ; GFX11-LABEL: widen_i16_constant32_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -653,7 +651,7 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg) { ; SI-LABEL: widen_i16_global_invariant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -669,7 +667,7 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg ; ; VI-LABEL: widen_i16_global_invariant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -683,7 +681,7 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg ; ; GFX11-LABEL: 
widen_i16_global_invariant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index d31c9e7e03e793..40e4692a18ec79 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { ; ; GFX9-LABEL: workgroup_id_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -17,7 +17,7 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { ; ; GFX12-LABEL: workgroup_id_x: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] @@ -33,26 +33,24 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry) { ; GFX9-LABEL: workgroup_id_xy: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_and_b32 s4, ttmp7, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: global_store_dword v1, v2, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: 
global_store_dword v2, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xy: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: s_and_b32 s4, ttmp7, 0xffff -; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 -; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, ttmp7 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: global_store_b32 v1, v2, s[2:3] +; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -67,14 +65,14 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) { ; GFX9-LABEL: workgroup_id_xyz: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_and_b32 s8, ttmp7, 0xffff +; GFX9-NEXT: s_and_b32 s6, ttmp7, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_lshr_b32 s0, ttmp7, 16 ; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -84,8 +82,8 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX12-LABEL: workgroup_id_xyz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX12-NEXT: s_and_b32 s2, ttmp7, 0xffff ; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_lshr_b32 s3, ttmp7, 16 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index a74dbe1de0d39e..def51f2b16d3e9 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -438,49 +438,33 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; ; GFX9-O3-LABEL: call: ; GFX9-O3: ; %bb.0: -; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 -; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; GFX9-O3-NEXT: s_mov_b32 s26, -1 -; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-O3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-O3-NEXT: s_mov_b32 s14, -1 +; GFX9-O3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-O3-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-O3-NEXT: s_add_u32 s12, s12, s3 +; GFX9-O3-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 -; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O3-NEXT: s_mov_b32 s14, s8 -; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_getpc_b64 s[8:9] +; GFX9-O3-NEXT: s_add_u32 s8, s8, called@rel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s9, s9, called@rel32@hi+12 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: 
s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 -; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: s_mov_b32 s12, s6 -; GFX9-O3-NEXT: s_mov_b32 s13, s7 -; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O3-NEXT: s_getpc_b64 s[22:23] -; GFX9-O3-NEXT: s_add_u32 s22, s22, called@rel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s23, s23, called@rel32@hi+12 -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O3-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-O3-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[12:13] +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[14:15] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: s_endpgm @@ -705,57 +689,42 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; ; GFX9-O3-LABEL: call_i64: ; GFX9-O3: ; %bb.0: -; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 -; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; GFX9-O3-NEXT: s_mov_b32 s26, -1 -; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-O3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-O3-NEXT: s_mov_b32 s14, -1 +; GFX9-O3-NEXT: s_mov_b32 s15, 
0xe00000 +; GFX9-O3-NEXT: s_add_u32 s12, s12, s3 +; GFX9-O3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 -; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O3-NEXT: s_mov_b32 s14, s8 -; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O3-NEXT: s_getpc_b64 s[4:5] +; GFX9-O3-NEXT: s_add_u32 s4, s4, called_i64@gotpcrel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s5, s5, called_i64@gotpcrel32@hi+12 +; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s3 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 -; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-O3-NEXT: s_getpc_b64 s[2:3] -; GFX9-O3-NEXT: s_add_u32 s2, s2, called_i64@gotpcrel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s3, s3, called_i64@gotpcrel32@hi+12 -; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 -; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: s_mov_b32 s12, s6 -; GFX9-O3-NEXT: s_mov_b32 s13, s7 -; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O3-NEXT: s_mov_b64 s[2:3], 
s[26:27] +; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[12:13] +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[14:15] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0 offset:4 +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 offset:4 ; GFX9-O3-NEXT: s_endpgm @@ -1339,49 +1308,33 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; ; GFX9-O3-LABEL: strict_wwm_call: ; GFX9-O3: ; %bb.0: -; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 -; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; GFX9-O3-NEXT: s_mov_b32 s26, -1 -; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-O3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-O3-NEXT: s_mov_b32 s14, -1 +; GFX9-O3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-O3-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-O3-NEXT: s_add_u32 s12, s12, s3 +; GFX9-O3-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 -; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O3-NEXT: 
s_mov_b32 s14, s8 -; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_getpc_b64 s[8:9] +; GFX9-O3-NEXT: s_add_u32 s8, s8, strict_wwm_called@rel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s9, s9, strict_wwm_called@rel32@hi+12 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 -; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: s_mov_b32 s12, s6 -; GFX9-O3-NEXT: s_mov_b32 s13, s7 -; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O3-NEXT: s_getpc_b64 s[22:23] -; GFX9-O3-NEXT: s_add_u32 s22, s22, strict_wwm_called@rel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s23, s23, strict_wwm_called@rel32@hi+12 -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O3-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-O3-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[12:13] +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[14:15] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0 +; 
GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: s_endpgm @@ -1606,57 +1559,42 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; ; GFX9-O3-LABEL: strict_wwm_call_i64: ; GFX9-O3: ; %bb.0: -; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 -; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; GFX9-O3-NEXT: s_mov_b32 s26, -1 -; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-O3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-O3-NEXT: s_mov_b32 s14, -1 +; GFX9-O3-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-O3-NEXT: s_add_u32 s12, s12, s3 +; GFX9-O3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 -; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O3-NEXT: s_mov_b32 s14, s8 -; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O3-NEXT: s_getpc_b64 s[4:5] +; GFX9-O3-NEXT: s_add_u32 s4, s4, strict_wwm_called_i64@gotpcrel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s5, s5, strict_wwm_called_i64@gotpcrel32@hi+12 +; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s3 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: 
v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 -; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-O3-NEXT: s_getpc_b64 s[2:3] -; GFX9-O3-NEXT: s_add_u32 s2, s2, strict_wwm_called_i64@gotpcrel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s3, s3, strict_wwm_called_i64@gotpcrel32@hi+12 -; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 -; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: s_mov_b32 s12, s6 -; GFX9-O3-NEXT: s_mov_b32 s13, s7 -; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] +; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[12:13] +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[14:15] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0 offset:4 +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 offset:4 ; GFX9-O3-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll 
b/llvm/test/CodeGen/AMDGPU/xor.ll index 9fac17f33d0d36..e15fd7f29671a4 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -5,31 +5,31 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v1, v3, v1 ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: xor_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -54,33 +54,33 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr 
addrspace(1) %in1) { ; SI-LABEL: xor_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v3, v7, v3 ; SI-NEXT: v_xor_b32_e32 v2, v6, v2 ; SI-NEXT: v_xor_b32_e32 v1, v5, v1 ; SI-NEXT: v_xor_b32_e32 v0, v4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: xor_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -107,8 +107,8 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s2, s10 @@ -133,8 +133,8 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ; ; VI-LABEL: xor_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -165,32 +165,32 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: v_xor_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_xor_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 -; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; 
VI-LABEL: v_xor_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -216,30 +216,30 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0 define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_xor_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: vector_xor_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: 
v_mov_b32_e32 v1, s7 @@ -263,7 +263,7 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: scalar_xor_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -276,7 +276,7 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; VI-LABEL: scalar_xor_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -292,8 +292,8 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: scalar_not_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -304,10 +304,10 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; ; VI-LABEL: scalar_not_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_not_b32 s2, s4 +; VI-NEXT: s_not_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -321,7 +321,7 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: 
vector_not_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -339,7 +339,7 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: vector_not_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -360,31 +360,31 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_xor_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 ; SI-NEXT: v_xor_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: 
vector_xor_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -409,8 +409,8 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: scalar_xor_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -424,8 +424,8 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; VI-LABEL: scalar_xor_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] @@ -442,7 +442,7 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; SI-LABEL: scalar_not_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -456,7 +456,7 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: scalar_not_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: 
v_mov_b32_e32 v1, s1 @@ -473,7 +473,7 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_not_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -492,7 +492,7 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: vector_not_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -514,7 +514,7 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) { ; SI-LABEL: xor_cf: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -545,7 +545,7 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; ; VI-LABEL: xor_cf: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -591,8 +591,8 @@ endif: define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_literal_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 
-1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -605,15 +605,15 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 ; ; VI-LABEL: scalar_xor_literal_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s1, s1, 0xf237b -; VI-NEXT: s_xor_b32 s0, s0, 0x3039 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_xor_b32 s3, s3, 0xf237b +; VI-NEXT: s_xor_b32 s2, s2, 0x3039 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %or = xor i64 %a, 4261135838621753 @@ -624,30 +624,30 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) { ; SI-LABEL: scalar_xor_literal_multi_use_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x13 ; SI-NEXT: s_movk_i32 s8, 0x3039 ; SI-NEXT: s_mov_b32 s9, 0xf237b -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: s_add_u32 s0, s6, 0x3039 -; SI-NEXT: s_addc_u32 s1, s7, 0xf237b +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: s_add_u32 s0, 
s2, 0x3039 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_addc_u32 s1, s3, 0xf237b ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_xor_literal_multi_use_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x3039 ; VI-NEXT: s_mov_b32 s3, 0xf237b ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -675,8 +675,8 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_inline_imm_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -688,14 +688,14 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: scalar_xor_inline_imm_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s0, s0, 63 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_xor_b32 s2, s2, 63 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 
v[2:3], v[0:1] ; VI-NEXT: s_endpgm %or = xor i64 %a, 63 @@ -706,8 +706,8 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_neg_inline_imm_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -719,14 +719,14 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, ; ; VI-LABEL: scalar_xor_neg_inline_imm_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], -8 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm %or = xor i64 %a, -8 @@ -737,7 +737,7 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: vector_xor_i64_neg_inline_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -756,7 +756,7 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ; ; VI-LABEL: 
vector_xor_i64_neg_inline_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -777,7 +777,7 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: vector_xor_literal_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -796,7 +796,7 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: vector_xor_literal_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll index 28da8ac423107c..f9a7e887ada239 100644 --- a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: zext_i16_to_i32_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -25,9 +25,9 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @zext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: zext_i16_to_i64_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -47,8 +47,8 @@ define amdgpu_kernel void @zext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @zext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: zext_i16_to_i32_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -68,8 +68,8 @@ define amdgpu_kernel void @zext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 define amdgpu_kernel void @zext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: zext_i16_to_i64_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll index 3b4ebef1529676..2588d88b002b8b 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll @@ -525,8 +525,6 @@ } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2 - -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } !0 = !{} diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll index 138106632c1bc8..9939366e855c41 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll @@ -56,4 +56,4 @@ define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 { ret void } -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index 3f6f0c909e8bbf..8922a233b1d8fb 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -43,7 +43,7 @@ ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' ; CHECK-NEXT: body: -define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 { +define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 @@ -66,5 +66,5 @@ bb4: ret void } -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index b3ed7376a1ede6..8326d95e0e7f21 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -27,16 +27,10 @@ ; CHECK-NEXT: returnsVoid: true ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; CHECK-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } -; CHECK-NEXT: dispatchID: { reg: '$sgpr8_sgpr9' } -; CHECK-NEXT: workGroupIDX: { reg: '$sgpr10' } -; CHECK-NEXT: workGroupIDY: { reg: '$sgpr11' } -; CHECK-NEXT: workGroupIDZ: { reg: '$sgpr12' } -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr13' } +; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' } +; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } ; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } -; CHECK-NEXT: workItemIDY: { reg: '$vgpr1' } -; CHECK-NEXT: workItemIDZ: { reg: '$vgpr2' } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 ; CHECK-NEXT: mode: diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index d9c3c4b17090bd..f8d97c81698cd7 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -8,11 +8,11 @@ declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr nocapture, double) # define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-LABEL: InferNothing: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: 
s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s4 ; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -34,12 +34,12 @@ entry: define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferFadd: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 @@ -58,12 +58,12 @@ entry: define protected amdgpu_kernel void @InferFmax(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferFmax: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 @@ -82,12 +82,12 @@ entry: define protected amdgpu_kernel void @InferFmin(i32 %a, ptr addrspace(1) %b, double %c) { ; 
CHECK-LABEL: InferFmin: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 @@ -106,13 +106,13 @@ entry: define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, double %c, ptr %d) { ; CHECK-LABEL: InferMixed: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x3c -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: v_mov_b32_e32 v0, s8 ; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: s_add_u32 s0, s4, s0 @@ -140,11 +140,11 @@ bb1: define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferPHI: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: 
s_addc_u32 s1, s5, s1 ; CHECK-NEXT: s_add_u32 s2, s0, -8 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected index 38b8ba12f06626..06a8a6fa04828b 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected @@ -5,8 +5,8 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-LABEL: i64_test: ; CHECK: SelectionDAG has 25 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 -; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0 +; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %1 ; CHECK-NEXT: t49: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> ; CHECK-NEXT: t26: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t29: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 @@ -28,7 +28,7 @@ define i64 @i32_test(i32 %i) nounwind readnone { ; CHECK-LABEL: i32_test: ; CHECK: SelectionDAG has 15 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0 ; CHECK-NEXT: t6: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t7: i32,i1 = 
V_ADD_CO_U32_e64 # D:1 t2, t6, TargetConstant:i1<0> ; CHECK-NEXT: t14: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t7 @@ -47,7 +47,7 @@ define i64 @i16_test(i16 %i) nounwind readnone { ; CHECK-LABEL: i16_test: ; CHECK: SelectionDAG has 18 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0 ; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_USHORT_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0> ; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<65535> @@ -68,7 +68,7 @@ define i64 @i8_test(i8 %i) nounwind readnone { ; CHECK-LABEL: i8_test: ; CHECK: SelectionDAG has 18 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0 ; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_UBYTE_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0> ; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<255>